From c9aee5964a977e6ca268704119f589bfb0695b54 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Wed, 21 Aug 2024 10:47:20 +1200 Subject: [PATCH] Formatting changes to match OpenVDB guidelines Added examples Updated README and environment/docker updates Signed-off-by: Jonathan Swartz --- fvdb/Dockerfile | 57 +- fvdb/README.md | 74 +- fvdb/docs/conf.py | 32 +- fvdb/env/build_environment.yml | 15 +- fvdb/env/learn_environment.yml | 1 + fvdb/env/test_environment.yml | 17 +- fvdb/examples/common.py | 190 + fvdb/examples/compare_conv_speed.py | 75 + fvdb/examples/grid_building.py | 155 + fvdb/examples/grid_subdivide_coarsen.py | 56 + fvdb/examples/mutable_grids.py | 107 + fvdb/examples/overfit_sdf.py | 116 + fvdb/examples/ray_segment_marching.py | 102 + fvdb/examples/ray_voxel_marching.py | 100 + fvdb/examples/sample_trilinear.py | 67 + fvdb/examples/splat_trilinear.py | 54 + fvdb/examples/subdivide.py | 74 + fvdb/examples/uniform_ray_marching.py | 124 + fvdb/examples/voxel_neighborhood.py | 49 + fvdb/fvdb/_Cpp.pyi | 415 +- fvdb/fvdb/__init__.py | 24 +- fvdb/fvdb/nn/modules.py | 100 +- fvdb/fvdb/nn/vdbtensor.py | 24 +- fvdb/fvdb/utils/__init__.py | 2 +- fvdb/fvdb/utils/build_ext.py | 38 +- fvdb/scripts/rename_wheels.py | 6 +- fvdb/setup.py | 211 +- fvdb/src/Config.cpp | 15 +- fvdb/src/Config.h | 17 +- fvdb/src/FVDB.cpp | 184 +- fvdb/src/FVDB.h | 259 +- fvdb/src/GridBatch.cpp | 1173 +-- fvdb/src/GridBatch.h | 857 ++- fvdb/src/JaggedTensor.cpp | 980 ++- fvdb/src/JaggedTensor.h | 690 +- fvdb/src/SparseConvPackInfo.cpp | 262 +- fvdb/src/SparseConvPackInfo.h | 197 +- fvdb/src/Types.h | 128 +- fvdb/src/detail/GridBatchImpl.cu | 470 +- fvdb/src/detail/GridBatchImpl.h | 438 +- fvdb/src/detail/TorchDeviceBuffer.cpp | 173 +- fvdb/src/detail/TorchDeviceBuffer.h | 115 +- fvdb/src/detail/TypesImpl.h | 218 +- fvdb/src/detail/VoxelCoordTransform.h | 202 +- fvdb/src/detail/autograd/Attention.cpp | 27 +- fvdb/src/detail/autograd/Attention.h | 28 +- fvdb/src/detail/autograd/Autograd.h | 25 +- fvdb/src/detail/autograd/AvgPoolGrid.cpp | 78 +- fvdb/src/detail/autograd/AvgPoolGrid.h | 23 +- fvdb/src/detail/autograd/FillToGrid.h | 71 +- fvdb/src/detail/autograd/JaggedReduce.cpp | 110 +- fvdb/src/detail/autograd/JaggedReduce.h | 40 +- fvdb/src/detail/autograd/MaxPoolGrid.cpp | 78 +- fvdb/src/detail/autograd/MaxPoolGrid.h | 25 +- fvdb/src/detail/autograd/ReadFromDense.h | 81 +- fvdb/src/detail/autograd/ReadIntoDense.cpp | 96 +- fvdb/src/detail/autograd/ReadIntoDense.h | 23 +- fvdb/src/detail/autograd/SampleGrid.cpp | 141 +- fvdb/src/detail/autograd/SampleGrid.h | 42 +- .../detail/autograd/SparseConvolutionHalo.cpp | 76 +- .../detail/autograd/SparseConvolutionHalo.h | 24 +- .../autograd/SparseConvolutionImplicitGEMM.h | 206 +- .../autograd/SparseConvolutionKernelMap.h | 160 +- fvdb/src/detail/autograd/SplatIntoGrid.cpp | 106 +- fvdb/src/detail/autograd/SplatIntoGrid.h | 39 +- fvdb/src/detail/autograd/TransformPoints.cpp | 64 +- fvdb/src/detail/autograd/TransformPoints.h | 26 +- fvdb/src/detail/autograd/UpsampleGrid.cpp | 76 +- fvdb/src/detail/autograd/UpsampleGrid.h | 22 +- fvdb/src/detail/autograd/VolumeRender.cpp | 108 +- fvdb/src/detail/autograd/VolumeRender.h | 28 +- fvdb/src/detail/build/Build.h | 114 +- fvdb/src/detail/build/CoarseFromFine.cpp | 43 +- fvdb/src/detail/build/ConvGrid.cpp | 83 +- fvdb/src/detail/build/DenseGrid.cpp | 60 +- fvdb/src/detail/build/EmptyGrid.cpp | 17 +- fvdb/src/detail/build/FineFromCoarse.cpp | 49 +- fvdb/src/detail/build/FromMesh.cpp | 75 +- 
.../build/NearestNeighborGridFromPoints.cpp | 145 +- .../src/detail/build/PaddedGridFromCoords.cpp | 116 +- fvdb/src/detail/build/PaddedGridFromGrid.cpp | 65 +- .../src/detail/build/PaddedGridFromPoints.cpp | 131 +- fvdb/src/detail/io/IO.h | 48 +- fvdb/src/detail/io/LoadNanovdb.cpp | 646 +- fvdb/src/detail/io/SaveNanoVDB.cpp | 402 +- fvdb/src/detail/ops/ActiveGridGoords.cu | 113 +- .../detail/ops/ActiveVoxelsInBoundsMask.cu | 171 +- fvdb/src/detail/ops/BuildDeviceGrid.cu | 297 +- fvdb/src/detail/ops/CoordsInGrid.cu | 72 +- fvdb/src/detail/ops/CountEnabledVoxels.cu | 89 +- fvdb/src/detail/ops/CubesInGrid.cu | 151 +- fvdb/src/detail/ops/DownsampleGridAvgPool.cu | 316 +- fvdb/src/detail/ops/DownsampleGridMaxPool.cu | 299 +- fvdb/src/detail/ops/EnabledMask.cu | 55 +- fvdb/src/detail/ops/FillToGrid.cu | 98 +- fvdb/src/detail/ops/GridEdgeNetwork.cu | 138 +- fvdb/src/detail/ops/IjkToIndex.cu | 70 +- fvdb/src/detail/ops/IjkToInvIndex.cu | 74 +- fvdb/src/detail/ops/JCat0.cu | 168 +- fvdb/src/detail/ops/JIdxForGrid.cu | 65 +- fvdb/src/detail/ops/JIdxForJOffsets.cu | 50 +- fvdb/src/detail/ops/JOffsetsFromJIdx.cu | 98 +- fvdb/src/detail/ops/JaggedTensorIndex.cu | 185 +- fvdb/src/detail/ops/MarchingCubes.cu | 346 +- fvdb/src/detail/ops/Ops.h | 439 +- fvdb/src/detail/ops/PaddedIJKForMesh.cu | 235 +- fvdb/src/detail/ops/PointsInGrid.cu | 86 +- .../src/detail/ops/RayImplicitIntersection.cu | 179 +- fvdb/src/detail/ops/ReadFromDense.cu | 153 +- fvdb/src/detail/ops/ReadIntoDense.cu | 139 +- fvdb/src/detail/ops/SampleGridBezier.cu | 121 +- .../detail/ops/SampleGridBezierWithGrad.cu | 147 +- .../ops/SampleGridBezierWithGradBackward.cu | 141 +- fvdb/src/detail/ops/SampleGridTrilinear.cu | 117 +- .../detail/ops/SampleGridTrilinearWithGrad.cu | 135 +- .../SampleGridTrilinearWithGradBackward.cu | 141 +- fvdb/src/detail/ops/SampleRaysUniform.cu | 434 +- .../detail/ops/ScaledDotProductAttention.cu | 191 +- fvdb/src/detail/ops/SegmentsAlongRays.cu | 358 +- fvdb/src/detail/ops/SetMasked.cu | 70 +- fvdb/src/detail/ops/SplatIntoGridBezier.cu | 148 +- fvdb/src/detail/ops/SplatIntoGridTrilinear.cu | 147 +- fvdb/src/detail/ops/TransformPointToGrid.cu | 270 +- fvdb/src/detail/ops/UpsampleGridNearest.cu | 297 +- fvdb/src/detail/ops/VolumeRender.cu | 551 +- fvdb/src/detail/ops/VoxelNeighborhood.cu | 102 +- fvdb/src/detail/ops/VoxelsAlongRays.cu | 419 +- fvdb/src/detail/ops/VoxelsForGridBuilding.cu | 707 +- .../detail/ops/convolution/backend/ConvOps.h | 103 +- .../backend/MESparseConvolution.cu | 1322 ++-- .../backend/SparseConvolutionCutlass.cu | 692 +- .../backend/SparseConvolutionHalo.cu | 376 +- .../backend/SparseConvolutionHaloGrad.cu | 355 +- .../backend/SparseConvolutionImplicitGEMM.cu | 5782 ++++++++------- .../SparseConvolutionImplicitGEMMGrad.cu | 5965 ++++++++------- ...SparseConvolutionImplicitGEMMGradSorted.cu | 6229 ++++++++-------- .../SparseConvolutionImplicitGEMMSorted.cu | 6534 +++++++++-------- .../backend/SparseConvolutionKernelMap.cu | 505 +- .../backend/SparseConvolutionLggs.cu | 231 +- .../convolution/pack_info/BrickHaloBuffer.cu | 172 +- .../pack_info/ConvolutionKernelMap.cu | 132 +- .../pack_info/IGEMMBitOperations.cu | 125 +- .../ops/convolution/pack_info/PackInfoOps.h | 36 +- fvdb/src/detail/ops/jagged/JaggedOps.h | 25 +- fvdb/src/detail/ops/jagged/JaggedReduce.cu | 117 +- fvdb/src/detail/ops/jagged/JaggedSort.cu | 110 +- .../utils/BezierInterpolationIterator.h | 84 +- .../BezierInterpolationWithGradIterator.h | 86 +- fvdb/src/detail/utils/MarchingCubesData.h | 468 +- 
.../utils/TrilinearInterpolationIterator.h | 87 +- .../TrilinearInterpolationWithGradIterator.h | 129 +- fvdb/src/detail/utils/Utils.h | 271 +- fvdb/src/detail/utils/cuda/Atomics.cuh | 514 +- fvdb/src/detail/utils/cuda/Utils.cuh | 481 +- .../utils/nanovdb/ActiveVoxelIterator.h | 164 +- .../detail/utils/nanovdb/CustomAccessors.h | 254 +- fvdb/src/detail/utils/nanovdb/HDDAIterators.h | 191 +- fvdb/src/detail/utils/nanovdb/Printing.h | 17 +- .../utils/nanovdb/TorchNanoConversions.h | 81 +- fvdb/src/python/Bindings.cpp | 236 +- fvdb/src/python/GridBatchBinding.cpp | 417 +- fvdb/src/python/JaggedTensorBinding.cpp | 16 +- fvdb/src/python/TypeCasters.h | 197 +- fvdb/tests/benchmark/comparative_benchmark.py | 52 +- fvdb/tests/benchmark/conftest.py | 1 + .../tests/benchmark/fvdb_benchmark/configs.py | 69 +- .../tests/benchmark/fvdb_benchmark/dataset.py | 32 +- .../fvdb_benchmark/model/minkunet.py | 143 +- .../benchmark/fvdb_benchmark/model/updown.py | 10 +- .../benchmark/fvdb_benchmark/model/xcube.py | 123 +- fvdb/tests/benchmark/fvdb_benchmark/utils.py | 15 +- .../tests/benchmark/fvdb_benchmark/wrapper.py | 158 +- fvdb/tests/benchmark/test_conv.py | 16 +- fvdb/tests/unit/common.py | 32 +- fvdb/tests/unit/nkfw_api/backend/__init__.py | 7 +- fvdb/tests/unit/nkfw_api/backend/abc.py | 68 +- fvdb/tests/unit/nkfw_api/backend/fvdb.py | 161 +- .../tests/unit/nkfw_api/backend/hash_table.py | 239 +- fvdb/tests/unit/nkfw_api/ext/__init__.py | 16 +- fvdb/tests/unit/test_accessors.py | 15 +- fvdb/tests/unit/test_basic_ops.py | 543 +- fvdb/tests/unit/test_batching.py | 300 +- fvdb/tests/unit/test_conv.py | 442 +- fvdb/tests/unit/test_dense_interface.py | 143 +- fvdb/tests/unit/test_dual.py | 47 +- fvdb/tests/unit/test_empty_grids.py | 154 +- fvdb/tests/unit/test_io.py | 167 +- fvdb/tests/unit/test_jagged_tensor.py | 344 +- fvdb/tests/unit/test_mutable_grids.py | 30 +- fvdb/tests/unit/test_nkfw_api.py | 309 +- fvdb/tests/unit/test_nn.py | 179 +- fvdb/tests/unit/test_ray_marching.py | 197 +- fvdb/tests/unit/test_sample.py | 485 +- 193 files changed, 32114 insertions(+), 25949 deletions(-) create mode 100644 fvdb/examples/common.py create mode 100644 fvdb/examples/compare_conv_speed.py create mode 100644 fvdb/examples/grid_building.py create mode 100644 fvdb/examples/grid_subdivide_coarsen.py create mode 100644 fvdb/examples/mutable_grids.py create mode 100644 fvdb/examples/overfit_sdf.py create mode 100644 fvdb/examples/ray_segment_marching.py create mode 100644 fvdb/examples/ray_voxel_marching.py create mode 100644 fvdb/examples/sample_trilinear.py create mode 100644 fvdb/examples/splat_trilinear.py create mode 100644 fvdb/examples/subdivide.py create mode 100644 fvdb/examples/uniform_ray_marching.py create mode 100644 fvdb/examples/voxel_neighborhood.py diff --git a/fvdb/Dockerfile b/fvdb/Dockerfile index 4b3aa750dc..8485af1e77 100644 --- a/fvdb/Dockerfile +++ b/fvdb/Dockerfile @@ -1,52 +1,17 @@ -ARG CUDA_VERSION=12.1.1 -ARG CUDNN_VERSION=8 +FROM nvcr.io/nvidia/pytorch:24.04-py3 -FROM nvidia/cuda:${CUDA_VERSION}-cudnn${CUDNN_VERSION}-devel-ubuntu20.04 - -ENV PATH /usr/local/cuda/bin:$PATH -ENV LD_LIBRARY_PATH /usr/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib:${LD_LIBRARY_PATH} - -# # nvidia-container-runtime -ENV NVIDIA_VISIBLE_DEVICES all -ENV NVIDIA_DRIVER_CAPABILITIES compute,utility,graphics - -RUN echo "Acquire { https::Verify-Peer false }" > /etc/apt/apt.conf.d/99verify-peer.conf \ - && if [ -f /etc/apt/sources.list.d/cuda.list ]; then \ - rm /etc/apt/sources.list.d/cuda.list; \ - fi \ - && if [ -f 
/etc/apt/sources.list.d/nvidia-ml.list ]; then \ - rm /etc/apt/sources.list.d/nvidia-ml.list; \ - fi \ - && apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated ca-certificates \ - && rm /etc/apt/apt.conf.d/99verify-peer.conf \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - wget \ - rsync \ - vim \ - git \ - curl \ - ninja-build \ - cmake \ - build-essential \ - xauth \ - openssh-server \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* - -RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && \ - bash ~/miniconda.sh -b -p /opt/conda && \ - rm ~/miniconda.sh - -ENV PATH /opt/conda/bin:$PATH -ENV TORCH_CUDA_ARCH_LIST "6.1;7.0;7.5;8.0;8.6+PTX" +ARG MODE=production +RUN echo "Building fVDB container in $MODE mode" # used for cross-compilation in docker build ENV FORCE_CUDA=1 WORKDIR /fvdb -COPY env/test_environment.yml . - -RUN /opt/conda/bin/conda env create -f test_environment.yml \ - && /opt/conda/bin/conda clean -ya \ - && /opt/conda/bin/conda init bash +COPY . . +RUN pip install --no-cache-dir -r env/build_requirements.txt + +RUN if [ "$MODE" = "production" ]; then \ + MAX_JOBS=$(free -g | awk '/^Mem:/{jobs=int($4/2.5); if(jobs<1) jobs=1; print jobs}') \ + TORCH_CUDA_ARCH_LIST="6.1;7.0;7.5;8.0;8.6+PTX" \ + python setup.py install; \ + fi \ No newline at end of file diff --git a/fvdb/README.md b/fvdb/README.md index f9429145cc..36d50052d8 100644 --- a/fvdb/README.md +++ b/fvdb/README.md @@ -46,7 +46,62 @@ conda activate fvdb_learn ## Building *f*VDB from Source -*f*VDB is a Python library implemented as a C++ Pytorch extension. + +### Environment Management +ƒVDB is a Python library implemented as a C++ PyTorch extension. Of course you can build ƒVDB in whatever environment suits you, but we provide two paths to constructing reliable environments for building and running ƒVDB: using [docker](#setting-up-a-docker-container) and using [conda](#setting-up-a-conda-environment). + +`conda` tends to be more flexible since you can dynamically reconfigure toolchains and modules to suit your larger project, but this can also be a more brittle experience compared to using a virtualized `docker` container. Using `conda` is generally recommended for development and testing, while using `docker` is recommended for CI/CD and deployment. + +#### Setting up a Docker Container + +Running a docker container is a great way to ensure that you have a consistent environment for building and running ƒVDB. + +Our provided [`Dockerfile`](Dockerfile) has two modes for building the image: `dev` and `production`. `production` constructs an image capable of building ƒVDB, builds and installs the ƒVDB libraries, and is ready for you to start running Python code that uses the `fvdb` module. `dev` mode constructs an image which is ready to build ƒVDB but does not build the ƒVDB libraries. + +Building the docker image in `production` mode is the default and is as simple as running the following command from the root of this repository: +```shell +# Build the docker image in production mode +docker build -t fvdb/prod . +``` + +Building the docker image in `dev` mode is done by setting the `MODE` build argument to `dev`: +```shell +# Build the docker image in dev mode +docker build --build-arg MODE=dev -t fvdb/dev . 
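+ +# For example, you could then build a local checkout inside the dev image +# (one possible workflow; adjust the mount and command to your setup): +docker run -it --gpus all --rm \ +    --mount type=bind,source="$(pwd)",target=/fvdb \ +    fvdb/dev:latest \ +    python setup.py develop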
+``` + +Running the docker container is done with the following command: +```shell +# Run an interactive bash shell (or replace with your command) +docker run -it --gpus all --rm \ + fvdb/dev:latest \ + /bin/bash +``` + + +#### Setting up a Conda Environment + +In order to get resolved package versions in your conda environment consistent with our testing, it is necessary to configure your `.condarc` since not all package resolving behaviour can be controlled with an `environment.yml` file. We recommend using `strict` channel priority in your conda configuration. This can be done by running the following command: + +```shell +conda config --set channel_priority strict +``` + +Further, it is recommended not to mix the `defaults` and `conda-forge` package channels when resolving environments. We have generally used `conda-forge` as the primary channel for our dependencies. You can remove the `defaults` channel and add `conda-forge` with the following command: + +```shell +conda config --remove channels defaults +conda config --add channels conda-forge +``` + +With these changes, your `.condarc` file should look like the following: + +```yaml +channel_priority: strict +channels: + - conda-forge +``` + **(Optional) Install libMamba for a huge quality of life improvement when using Conda** ``` @@ -55,7 +110,6 @@ conda install -n base conda-libmamba-solver conda config --set solver libmamba ``` -### Conda Environment Next, create the `fvdb` conda environment by running the following command from the root of this repository, and then grabbing a ☕: ```shell @@ -106,22 +160,6 @@ sphinx-build -E -a docs/ build/sphinx open build/sphinx/index.html ``` -### Docker Image - -To build and test *f*VDB, we have the dockerfile available: -```shell -# Build fvdb -docker build . -t fvdb-dev -# Run fvdb (or replace with your command) -docker run -it --gpus all --rm \ - --user $(id -u):$(id -g) \ - --mount type=bind,source="$HOME/.ssh",target=/root/.ssh \ - --mount type=bind,source="$(pwd)",target=/fvdb \ - fvdb-dev:latest \ - conda run -n fvdb_test --no-capture-output python setup.py develop -``` - - ## Usage Examples diff --git a/fvdb/docs/conf.py b/fvdb/docs/conf.py index b1d0ae3be3..9f15a297e0 100644 --- a/fvdb/docs/conf.py +++ b/fvdb/docs/conf.py @@ -9,14 +9,15 @@ import os import sys -sys.path.insert(0, os.path.abspath('..')) + +sys.path.insert(0, os.path.abspath("..")) # -- Project information ----------------------------------------------------- -project = 'fVDB' -copyright = '2023, NVIDIA Corporation' -author = 'NVIDIA Corporation' +project = "fVDB" +copyright = "2023, NVIDIA Corporation" +author = "NVIDIA Corporation" # -- General configuration --------------------------------------------------- @@ -24,12 +25,7 @@ # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.viewcode', - 'sphinx.ext.napoleon', - 'myst_parser' -] +extensions = ["sphinx.ext.autodoc", "sphinx.ext.viewcode", "sphinx.ext.napoleon", "myst_parser"] myst_enable_extensions = [ "amsmath", @@ -49,28 +45,26 @@ ] # Fix return-type in google-style docstrings -napoleon_custom_sections = [('Returns', 'params_style')] +napoleon_custom_sections = [("Returns", "params_style")] # Add any paths that contain templates here, relative to this directory. 
-templates_path = ['_templates'] +templates_path = ["_templates"] -source_suffix = ['.rst', '.md'] +source_suffix = [".rst", ".md"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] -autodoc_default_options = { - 'undoc-members': 'forward, extra_repr' -} +autodoc_default_options = {"undoc-members": "forward, extra_repr"} # -- Options for HTML output ------------------------------------------------- # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -html_theme = 'sphinx_rtd_theme' +html_theme = "sphinx_rtd_theme" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -80,6 +74,7 @@ # -- Custom hooks ------------------------------------------------------------ + def process_signature(app, what, name, obj, options, signature, return_annotation): if signature is not None: signature = signature.replace("._Cpp", "") @@ -91,5 +86,6 @@ def process_signature(app, what, name, obj, options, signature, return_annotatio return signature, return_annotation + def setup(app): app.connect("autodoc-process-signature", process_signature) diff --git a/fvdb/env/build_environment.yml b/fvdb/env/build_environment.yml index f5fcff7320..19b8436c9f 100644 --- a/fvdb/env/build_environment.yml +++ b/fvdb/env/build_environment.yml @@ -1,7 +1,8 @@ name: fvdb_build channels: - - nvidia/label/cuda-12.1.0 - pytorch + - nvidia + - conda-forge dependencies: - python=3.10 - pytorch::pytorch=2.2 @@ -11,14 +12,14 @@ dependencies: - ca-certificates - certifi - openssl - - nvidia/label/cuda-12.1.0::cuda - - nvidia/label/cuda-12.1.0::cuda-tools - - nvidia/label/cuda-12.1.0::cuda-nvcc - - nvidia/label/cuda-12.1.0::cuda-cccl - - nvidia/label/cuda-12.1.0::cuda-libraries-static + - cuda-toolkit=12.1 + - cuda-compiler=12.1 + - cuda-nvcc=12.1 + - cuda-cccl=12.1 + - cuda-libraries-static=12.1 - gcc_linux-64=11 - gxx_linux-64=11 - - setuptools + - setuptools>=68.2.2 - cmake - make - ninja diff --git a/fvdb/env/learn_environment.yml b/fvdb/env/learn_environment.yml index 976bb5dc3c..b84607d13c 100644 --- a/fvdb/env/learn_environment.yml +++ b/fvdb/env/learn_environment.yml @@ -28,6 +28,7 @@ dependencies: - pytest-benchmark - polyscope - numpy<2 + - pyrender - pip: - point-cloud-utils - linkify-it-py diff --git a/fvdb/env/test_environment.yml b/fvdb/env/test_environment.yml index c3175190b8..d4e6f792d4 100644 --- a/fvdb/env/test_environment.yml +++ b/fvdb/env/test_environment.yml @@ -1,28 +1,29 @@ name: fvdb_test channels: - pyg - - nvidia/label/cuda-12.1.0 - pytorch + - nvidia + - conda-forge dependencies: - python=3.10 - pytorch::pytorch=2.2 - pytorch::pytorch-cuda=12.1 - tensorboard - - pip + - pip>=23.3.1 - git - gitpython - ca-certificates - certifi - openssl - - nvidia/label/cuda-12.1.0::cuda - - nvidia/label/cuda-12.1.0::cuda-tools - - nvidia/label/cuda-12.1.0::cuda-nvcc - - nvidia/label/cuda-12.1.0::cuda-cccl - - nvidia/label/cuda-12.1.0::cuda-libraries-static + - cuda-toolkit=12.1 + - cuda-compiler=12.1 + - cuda-nvcc=12.1 + - cuda-cccl=12.1 + - cuda-libraries-static=12.1 - parameterized - gcc_linux-64=11 - gxx_linux-64=11 - - setuptools + - setuptools>=68.2.2 - cmake - make - ninja diff --git a/fvdb/examples/common.py 
b/fvdb/examples/common.py new file mode 100644 index 0000000000..9eb5a22e07 --- /dev/null +++ b/fvdb/examples/common.py @@ -0,0 +1,190 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import hashlib +import logging +import timeit +from pathlib import Path +from typing import List, Tuple, Union + +import git +import git.repo +import numpy as np +import point_cloud_utils as pcu +import polyscope as ps +import torch +from git.exc import InvalidGitRepositoryError + +from fvdb import GridBatch + + +def _clone_fvdb_example_data(): + def is_git_repo(repo_path: str): + is_repo = False + try: + _ = git.repo.Repo(repo_path) + is_repo = True + except InvalidGitRepositoryError: + is_repo = False + + return is_repo + + git_tag = "main" + git_url = "git@github.com:voxel-foundation/fvdb-example-data.git" + repo_root = Path(__file__).resolve().parent.parent + external_path = repo_root / "external" + if not external_path.exists(): + external_path.mkdir() + elif not external_path.is_dir(): + raise RuntimeError(f"External path {external_path} exists but is not a directory") + + repo_path = external_path / "fvdb_example_data" + if repo_path.exists() and repo_path.is_dir(): + if is_git_repo(str(repo_path)): + repo = git.repo.Repo(repo_path) + repo.git.checkout(git_tag) + else: + raise ValueError(f"A path {repo_path} exists but is not a git repo") + else: + repo = git.repo.Repo.clone_from(git_url, repo_path) + repo.git.checkout(git_tag) + + return repo_path, repo + + +def get_fvdb_example_data_path(): + repo_path, _ = _clone_fvdb_example_data() + return repo_path + + +def get_md5_checksum(file_path: Path): + md5_hash = hashlib.md5() + with open(file_path, "rb") as f: + for byte_block in iter(lambda: f.read(4096), b""): + md5_hash.update(byte_block) + return md5_hash.hexdigest() + + +def make_grid_from_points(pts: torch.Tensor, padding, vox_size, vox_origin) -> GridBatch: + logging.info("Building GridBatch from points...") + start = timeit.default_timer() + grid = GridBatch(device=pts.device) + grid.set_from_points(pts, [-padding] * 3, [padding] * 3, voxel_sizes=vox_size, origins=vox_origin) + torch.cuda.synchronize() + logging.info(f"Done in {timeit.default_timer() - start}s") + logging.info(f"GridBatch has {grid.total_voxels} voxels") + + return grid + + +def make_ray_grid( + nrays: int, + origin: Union[torch.Tensor, Tuple, List], + minb=(-0.3, -0.3), + maxb=(0.3, 0.3), + device: Union[str, torch.device] = "cpu", + dtype=torch.float32, +) -> Tuple[torch.Tensor, torch.Tensor]: + ray_o = torch.tensor([origin] * nrays**2) + + ray_d = torch.from_numpy( + np.stack( + [a.ravel() for a in np.mgrid[minb[0] : maxb[0] : nrays * 1j, minb[1] : maxb[1] : nrays * 1j]] + + [np.ones(nrays**2)], + axis=-1, + ).astype(np.float32) + ) + ray_d /= torch.norm(ray_d, dim=-1, keepdim=True) + + ray_o, ray_d = ray_o.to(device).to(dtype), ray_d.to(device).to(dtype) + + return ray_o, ray_d + + +def load_pointcloud( + data_path, + skip_every=1, + shuffle=False, + device=torch.device("cuda"), + dtype=torch.float32, +) -> torch.Tensor: + logging.info(f"Loading pointlcoud {data_path}...") + start = timeit.default_timer() + pts = pcu.load_mesh_v(data_path) + if shuffle: + pts = pts[np.random.permutation(pts.shape[0])] + pts = pts[::skip_every] + logging.info(f"Done in {timeit.default_timer() - start}s") + return torch.from_numpy(pts).to(device).to(dtype) + + +def load_mesh( + data_path, skip_every=1, mode="vn", device=torch.device("cuda"), dtype=torch.float32 +) -> List[torch.Tensor]: + 
logging.info(f"Loading mesh {data_path}...") + start = timeit.default_timer() + if mode == "v": + attrs = [pcu.load_mesh_v(data_path)] + elif mode == "vf": + attrs = pcu.load_mesh_vf(data_path) + elif mode == "vn": + attrs = pcu.load_mesh_vn(data_path) + else: + raise ValueError(f"Unsupported mode {mode}") + + attrs = [torch.from_numpy(a[::skip_every]).to(device).to(dtype) for a in attrs] + logging.info(f"Done in {timeit.default_timer() - start}s") + return attrs + + +def load_dragon_mesh(skip_every=1, mode="vn", device=torch.device("cuda"), dtype=torch.float32) -> List[torch.Tensor]: + data_path = get_fvdb_example_data_path() / "meshes" / "dragon.ply" + if get_md5_checksum(data_path) != "0222e7d2147eebcb2eacdaf6263a9512": + raise ValueError(f"Checksum for {data_path} is incorrect") + return load_mesh(data_path, mode=mode, skip_every=skip_every, device=device, dtype=dtype) + + +def load_happy_mesh(skip_every=1, mode="vn", device=torch.device("cuda"), dtype=torch.float32) -> List[torch.Tensor]: + data_path = get_fvdb_example_data_path() / "meshes" / "happy.ply" + if get_md5_checksum(data_path) != "5cfe3c9c0b58bad9a77b47ae04454160": + raise ValueError(f"Checksum for {data_path} is incorrect") + return load_mesh(data_path, mode=mode, skip_every=skip_every, device=device, dtype=dtype) + + +def load_bunny_mesh(skip_every=1, mode="vn", device=torch.device("cuda"), dtype=torch.float32) -> List[torch.Tensor]: + data_path = get_fvdb_example_data_path() / "meshes" / "bunny.ply" + if get_md5_checksum(data_path) != "fe2f062a8e22b7dab895a1945c32cd58": + raise ValueError(f"Checksum for {data_path} is incorrect") + return load_mesh(data_path, mode=mode, skip_every=skip_every, device=device, dtype=dtype) + + +def load_car_1(skip_every=1, mode="vn", device=torch.device("cuda"), dtype=torch.float32) -> List[torch.Tensor]: + data_path = get_fvdb_example_data_path() / "meshes" / "car-mesh-1.ply" + if get_md5_checksum(data_path) != "e96d59a5ee392a40442ca510c0ab8f17": + raise ValueError(f"Checksum for {data_path} is incorrect") + return load_mesh(data_path, mode=mode, skip_every=skip_every, device=device, dtype=dtype) + + +def load_car_2(skip_every=1, mode="vn", device=torch.device("cuda"), dtype=torch.float32) -> List[torch.Tensor]: + data_path = get_fvdb_example_data_path() / "meshes" / "car-mesh-2.ply" + if get_md5_checksum(data_path) != "e7bcf0922518f6b43930e155a188a3a8": + raise ValueError(f"Checksum for {data_path} is incorrect") + return load_mesh(data_path, mode=mode, skip_every=skip_every, device=device, dtype=dtype) + + +def plot_ray_segments(ray_o, ray_d, times, plot_every=1): + for i in range(0, ray_o.shape[0], plot_every): + t0s = times[i].jdata[:, 0].unsqueeze(-1) + t1s = times[i].jdata[:, 1].unsqueeze(-1) + roi = ray_o[i].unsqueeze(0) + rdi = ray_d[i].unsqueeze(0) + rp = torch.cat([roi + t0s * rdi, roi + t1s * rdi]) + re = torch.stack( + [torch.arange(t0s.shape[0]), torch.arange(t0s.shape[0]) + t0s.shape[0]], + dim=-1, + ) + + ray_segs = ps.register_curve_network(f"ray segments {i}", rp, re, radius=0.001) + rv = torch.zeros(re.shape[0]) + rv[::2] = 1.0 + ray_segs.add_scalar_quantity(f"segment colors {i}", rv, defined_on="edges", enabled=True, cmap="jet") diff --git a/fvdb/examples/compare_conv_speed.py b/fvdb/examples/compare_conv_speed.py new file mode 100644 index 0000000000..cf2c9e44a6 --- /dev/null +++ b/fvdb/examples/compare_conv_speed.py @@ -0,0 +1,75 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import time + +import numpy as np +import torch 
+import tqdm +from common import load_dragon_mesh + +from fvdb import GridBatch + + +def benchmark_inplace_conv(grid: GridBatch, in_feature, in_kernel): + start_time = time.perf_counter() + out_feature = grid.sparse_conv_halo(in_feature, in_kernel) + torch.cuda.synchronize() + return time.perf_counter() - start_time + + +def benchmark_kmap_conv(grid: GridBatch, in_feature, in_kernel): + start_time = time.perf_counter() + kmap, _ = grid.sparse_conv_kernel_map(kernel_size=in_kernel.size(-1), stride=1) + kmap.build_gather_scatter() + torch.cuda.synchronize() + + kmap_time = time.perf_counter() + out_feature = kmap.sparse_conv_3d(in_feature, in_kernel) + torch.cuda.synchronize() + + return kmap_time - start_time, time.perf_counter() - kmap_time + + +def main(): + device = torch.device("cuda") + dtype = torch.float32 + kernel_size = 3 + in_channel, out_channel = 128, 64 + + vox_size = 0.005 + vox_origin = (0.0, 0.0, 0.0) + p, n = load_dragon_mesh(device=device, dtype=dtype) + + index0 = GridBatch(device=device) + index0.set_from_points(p, [-1, -1, -1], [1, 1, 1], voxel_sizes=vox_size, origins=vox_origin) + + grid_feats = torch.rand((index0.total_voxels, in_channel), device=device, dtype=dtype) * 0.5 + 0.5 + kernels = ( + torch.rand(out_channel, in_channel, kernel_size, kernel_size, kernel_size, dtype=dtype, device=device) * 0.5 + + 0.5 + ) + + torch.cuda.synchronize() + + inplace_time = [] + kmap_time = [] + conv_time = [] + + for iter in tqdm.trange(100): + inplace = benchmark_inplace_conv(index0, grid_feats, kernels) + kmap, conv = benchmark_kmap_conv(index0, grid_feats, kernels) + inplace_time.append(inplace) + kmap_time.append(kmap) + conv_time.append(conv) + + inplace_time, kmap_time, conv_time = inplace_time[5:], kmap_time[5:], conv_time[5:] + + print(f"Num voxels = {index0.num_voxels}, channel = {in_channel} -> {out_channel}, device = {device}") + print(f"Convolution Inplace {np.mean(inplace_time):.4f} +/- {np.std(inplace_time):.4f}") + print(f"Kmap {np.mean(kmap_time):.4f} +/- {np.std(kmap_time):.4f}") + print(f"Kmap Convolution {np.mean(conv_time):.4f} +/- {np.std(conv_time):.4f}") + + +if __name__ == "__main__": + main() diff --git a/fvdb/examples/grid_building.py b/fvdb/examples/grid_building.py new file mode 100644 index 0000000000..6213ec6462 --- /dev/null +++ b/fvdb/examples/grid_building.py @@ -0,0 +1,155 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +from pathlib import Path + +import numpy as np +import point_cloud_utils as pcu +import polyscope as ps +import torch +from common import load_car_1, load_car_2 + +import fvdb +from fvdb import JaggedTensor +from fvdb.nn import VDBTensor + +voxel_size_1 = 0.02 +voxel_size_2 = 0.03 + + +def build_from_pointcloud(pcd_1: np.ndarray, pcd_2: np.ndarray): + # Assemble point clouds into JaggedTensor + pcd_jagged = JaggedTensor([torch.from_numpy(pcd_1).float().cuda(), torch.from_numpy(pcd_2).float().cuda()]) + voxel_sizes = [[voxel_size_1, voxel_size_1, voxel_size_1], [voxel_size_2, voxel_size_2, voxel_size_2]] + + # Method 1: + grid_a1 = fvdb.sparse_grid_from_points(pcd_jagged, voxel_sizes=voxel_sizes, origins=[0.0] * 3) + + # Method 2: + grid_a2 = fvdb.GridBatch(device=pcd_jagged.device) + grid_a2.set_from_points(pcd_jagged, voxel_sizes=voxel_sizes, origins=[0.0] * 3) + + # Visualization + gv_a1, ge_a1 = grid_a1.viz_edge_network + ps.remove_all_structures() + ps.register_point_cloud("pcd_1", pcd_1, enabled=True, radius=0.01) + ps.register_curve_network( + "grid_a1", 
gv_a1[0].jdata.cpu().numpy(), ge_a1[0].jdata.cpu().numpy(), enabled=True, radius=0.004 + ) + ps.register_point_cloud("pcd_2", pcd_2, enabled=True, radius=0.01) + ps.register_curve_network( + "grid_a2", gv_a1[1].jdata.cpu().numpy(), ge_a1[1].jdata.cpu().numpy(), enabled=True, radius=0.004 + ) + ps.show() + + # Build grid from containing nearest voxels to the points + grid_b = fvdb.sparse_grid_from_nearest_voxels_to_points(pcd_jagged, voxel_sizes=voxel_sizes, origins=[0.0] * 3) + + # Visualization + gv_b, ge_b = grid_b.viz_edge_network + ps.remove_all_structures() + ps.register_point_cloud("pcd_1", pcd_1, enabled=True, radius=0.01) + ps.register_curve_network( + "grid_b1", gv_b[0].jdata.cpu().numpy(), ge_b[0].jdata.cpu().numpy(), enabled=True, radius=0.004 + ) + ps.register_point_cloud("pcd_2", pcd_2, enabled=True, radius=0.01) + ps.register_curve_network( + "grid_b2", gv_b[1].jdata.cpu().numpy(), ge_b[1].jdata.cpu().numpy(), enabled=True, radius=0.004 + ) + ps.show() + + +def build_from_coordinates(coords_1: np.ndarray, coords_2: np.ndarray): + coords_jagged = JaggedTensor([torch.from_numpy(coords_1).long().cuda(), torch.from_numpy(coords_2).long().cuda()]) + voxel_sizes = [[voxel_size_1, voxel_size_1, voxel_size_1], [voxel_size_2, voxel_size_2, voxel_size_2]] + + grid = fvdb.sparse_grid_from_ijk(coords_jagged, voxel_sizes=voxel_sizes, origins=[0.0] * 3) + + # Visualization + grid_mesh_1 = pcu.voxel_grid_geometry( + grid.ijk[0].jdata.cpu().numpy(), grid.voxel_sizes[0].cpu().numpy(), gap_fraction=0.1 + ) + grid_mesh_2 = pcu.voxel_grid_geometry( + grid.ijk[1].jdata.cpu().numpy(), grid.voxel_sizes[1].cpu().numpy(), gap_fraction=0.1 + ) + ps.remove_all_structures() + ps.register_surface_mesh("grid_1", grid_mesh_1[0], grid_mesh_1[1], enabled=True) + ps.register_surface_mesh("grid_2", grid_mesh_2[0], grid_mesh_2[1], enabled=True) + ps.show() + + +def build_from_mesh(mesh_1_vf, mesh_2_vf): + mesh_1_v, mesh_1_f = mesh_1_vf + mesh_2_v, mesh_2_f = mesh_2_vf + + mesh_v_jagged = JaggedTensor([torch.from_numpy(mesh_1_v).float().cuda(), torch.from_numpy(mesh_2_v).float().cuda()]) + mesh_f_jagged = JaggedTensor( + [ + torch.from_numpy(mesh_1_f.astype(np.int64)).long().cuda(), + torch.from_numpy(mesh_2_f.astype(np.int64)).long().cuda(), + ] + ) + + voxel_sizes = [[voxel_size_1, voxel_size_1, voxel_size_1], [voxel_size_2, voxel_size_2, voxel_size_2]] + grid = fvdb.sparse_grid_from_mesh(mesh_v_jagged, mesh_f_jagged, voxel_sizes=voxel_sizes, origins=[0.0] * 3) + + # Visualization + gv, ge = grid.viz_edge_network + ps.remove_all_structures() + ps.register_surface_mesh("mesh_1", mesh_1_v, mesh_1_f, enabled=True) + ps.register_curve_network( + "grid_1", gv[0].jdata.cpu().numpy(), ge[0].jdata.cpu().numpy(), enabled=True, radius=0.004 + ) + ps.register_surface_mesh("mesh_2", mesh_2_v, mesh_2_f, enabled=True) + ps.register_curve_network( + "grid_2", gv[1].jdata.cpu().numpy(), ge[1].jdata.cpu().numpy(), enabled=True, radius=0.004 + ) + ps.show() + + +def build_from_dense(): + grid = fvdb.sparse_grid_from_dense(num_grids=1, dense_dims=[32, 32, 32], device="cuda") + + # Easy way to initialize a VDBTensor from a torch 3D tensor [B, D, H, W, C] + dense_data = torch.ones(2, 32, 32, 32, 16).cuda() + sparse_data = VDBTensor.from_dense(dense_data, voxel_sizes=[0.1] * 3) + dense_data_back = sparse_data.to_dense() + assert torch.all(dense_data == dense_data_back) + + # Visualization + grid_mesh = pcu.voxel_grid_geometry( + grid.ijk[0].jdata.cpu().numpy(), grid.voxel_sizes[0].cpu().numpy(), gap_fraction=0.1 + ) + 
ps.remove_all_structures() + ps.register_surface_mesh("grid_1", grid_mesh[0], grid_mesh[1], enabled=True) + ps.show() + + +if __name__ == "__main__": + ps.init() + ps.set_ground_plane_mode("shadow_only") + ps.set_navigation_style("free") + + base_path = Path(__file__).parent.parent + + mesh_1_v, mesh_1_f = load_car_1(mode="vf", device=torch.device("cpu")) + mesh_2_v, mesh_2_f = load_car_2(mode="vf", device=torch.device("cpu")) + + mesh_1_v, mesh_1_f = mesh_1_v.numpy(), mesh_1_f.numpy().astype(np.int64) + mesh_2_v, mesh_2_f = mesh_2_v.numpy(), mesh_2_f.numpy().astype(np.int64) + + mesh_2_v[:, 2] += 0.8 + + fi1, bc1 = pcu.sample_mesh_random(mesh_1_v, mesh_1_f, 10000) + fi2, bc2 = pcu.sample_mesh_random(mesh_2_v, mesh_2_f, 10000) + + pcd_1 = pcu.interpolate_barycentric_coords(mesh_1_f, fi1, bc1, mesh_1_v) + pcd_2 = pcu.interpolate_barycentric_coords(mesh_2_f, fi2, bc2, mesh_2_v) + + ijk_1 = np.unique(np.floor(pcd_1 / voxel_size_1).astype(np.int64), axis=0) + ijk_2 = np.unique(np.floor(pcd_2 / voxel_size_2).astype(np.int64), axis=0) + + build_from_pointcloud(pcd_1, pcd_2) + build_from_mesh((mesh_1_v, mesh_1_f), (mesh_2_v, mesh_2_f)) + build_from_coordinates(ijk_1, ijk_2) + build_from_dense() diff --git a/fvdb/examples/grid_subdivide_coarsen.py b/fvdb/examples/grid_subdivide_coarsen.py new file mode 100644 index 0000000000..a4dec20498 --- /dev/null +++ b/fvdb/examples/grid_subdivide_coarsen.py @@ -0,0 +1,56 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import uuid + +import numpy as np +import point_cloud_utils as pcu +import polyscope as ps +import torch +from common import load_dragon_mesh + +import fvdb + + +def visualize_grid(a: fvdb.GridBatch, offset: float): + assert a.grid_count == 1 + mesh_a = pcu.voxel_grid_geometry(a.ijk[0].jdata.cpu().numpy(), a.voxel_sizes[0].cpu().numpy(), gap_fraction=0.1) + ps.register_surface_mesh( + str(uuid.uuid4()), + mesh_a[0] + np.array([0.0, 0.0, offset]) - a.voxel_sizes[0].cpu().numpy()[None, :] / 2.0, + mesh_a[1], + enabled=True, + ) + + +if __name__ == "__main__": + ps.init() + ps.set_ground_plane_mode("shadow_only") + ps.set_navigation_style("free") + + [p] = load_dragon_mesh(mode="v", device=torch.device("cuda")) + + grid_origin = fvdb.sparse_grid_from_points(p, voxel_sizes=[0.005] * 3, origins=[0.0] * 3) + visualize_grid(grid_origin, 0.0) + + grid_subdivided = grid_origin.subdivided_grid(2) + visualize_grid(grid_subdivided, 0.15) + + grid_coarsened = grid_origin.coarsened_grid(2) + visualize_grid(grid_coarsened, 0.3) + + ps.show() + + grid_dual = grid_origin.dual_grid() + + grid_dual_gv, grid_dual_ge = grid_dual.viz_edge_network + ps.remove_all_structures() + visualize_grid(grid_origin, 0.0) + ps.register_curve_network( + str(uuid.uuid4()), + grid_dual_gv[0].jdata.cpu().numpy(), + grid_dual_ge[0].jdata.cpu().numpy(), + enabled=True, + radius=0.004, + ) + ps.show() diff --git a/fvdb/examples/mutable_grids.py b/fvdb/examples/mutable_grids.py new file mode 100644 index 0000000000..6af4eeafaa --- /dev/null +++ b/fvdb/examples/mutable_grids.py @@ -0,0 +1,107 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +from pathlib import Path + +import point_cloud_utils as pcu +import polyscope as ps +import torch +from common import load_car_1, load_car_2 + +import fvdb +from fvdb import GridBatch, JaggedTensor + + +def visualize_grid_color(grid: GridBatch, rgb: JaggedTensor, ignore_disabled: bool = False): + for b in range(grid.grid_count): + grid_mask = 
grid.enabled_mask[b].jdata.cpu().numpy() + if ignore_disabled: + grid_mask.fill(True) + + grid_mesh = pcu.voxel_grid_geometry( + grid.ijk[b].jdata.cpu().numpy()[grid_mask], grid.voxel_sizes[b].cpu().numpy(), gap_fraction=0.1 + ) + grid_color = rgb[b].jdata.cpu().numpy()[grid_mask].repeat(8, axis=0).reshape(-1, 3) + + ps.register_surface_mesh(f"grid_{b}", grid_mesh[0], grid_mesh[1], enabled=True).add_color_quantity( + "color", grid_color, enabled=True + ) + + +if __name__ == "__main__": + ps.init() + ps.set_ground_plane_mode("shadow_only") + ps.set_navigation_style("free") + + base_path = Path(__file__).parent.parent + + mesh_1_v, mesh_1_f = load_car_1(mode="vf") + mesh_2_v, mesh_2_f = load_car_2(mode="vf") + + mesh_1_f, mesh_2_f = mesh_1_f.long(), mesh_2_f.long() + mesh_2_v[:, 2] += 0.8 + + mesh_v_jagged = JaggedTensor([mesh_1_v, mesh_2_v]) + mesh_f_jagged = JaggedTensor([mesh_1_f, mesh_2_f]) + + fi1, bc1 = pcu.sample_mesh_random(mesh_1_v.cpu().numpy(), mesh_1_f.cpu().numpy(), 10000) + fi2, bc2 = pcu.sample_mesh_random(mesh_2_v.cpu().numpy(), mesh_2_f.cpu().numpy(), 10000) + + pcd_1 = pcu.interpolate_barycentric_coords(mesh_1_f.cpu().numpy(), fi1, bc1, mesh_1_v.cpu().numpy()) + pcd_2 = pcu.interpolate_barycentric_coords(mesh_2_f.cpu().numpy(), fi2, bc2, mesh_2_v.cpu().numpy()) + pcd_jagged = JaggedTensor([torch.from_numpy(pcd_1).float().cuda(), torch.from_numpy(pcd_2).float().cuda()]) + + # Grid creation + grid = fvdb.sparse_grid_from_mesh( + mesh_v_jagged, mesh_f_jagged, voxel_sizes=[0.01] * 3, origins=[0.0] * 3, mutable=True + ) + feature = grid.grid_to_world(grid.ijk.float()) + feature.jdata = (feature.jdata - feature.jdata.min(dim=0).values) / ( + feature.jdata.max(dim=0).values - feature.jdata.min(dim=0).values + ) + + # Visualization + ps.remove_all_structures() + visualize_grid_color(grid, feature) + ps.show() + + # Get the IJK coordinates to be disabled + disable_ijk = grid.ijk.rmask(feature.jdata[:, 0] > 0.5) + grid.disable_ijk(disable_ijk) + + # Visualize disable mask + enabled_mask = grid.enabled_mask + ps.remove_all_structures() + visualize_grid_color( + grid, feature.jagged_like(enabled_mask.jdata.unsqueeze(1).repeat(1, 3).float()), ignore_disabled=True + ) + ps.show() + + # Sample features onto points + pts_feature = grid.sample_trilinear(pcd_jagged, feature) + + # Visualize (disabled grid will no longer function) + ps.remove_all_structures() + ps.register_point_cloud("pcd_1", pcd_1, enabled=True).add_color_quantity( + "feature", pts_feature[0].jdata.cpu().numpy(), enabled=True + ) + ps.register_point_cloud("pcd_2", pcd_2, enabled=True).add_color_quantity( + "feature", pts_feature[1].jdata.cpu().numpy(), enabled=True + ) + ps.show() + + # We could enable those IJK back + grid.enable_ijk(disable_ijk) + + # Sample features onto points + pts_feature = grid.sample_trilinear(pcd_jagged, feature) + + # Visualize (this time we got the original features back) + ps.remove_all_structures() + ps.register_point_cloud("pcd_1", pcd_1, enabled=True).add_color_quantity( + "feature", pts_feature[0].jdata.cpu().numpy(), enabled=True + ) + ps.register_point_cloud("pcd_2", pcd_2, enabled=True).add_color_quantity( + "feature", pts_feature[1].jdata.cpu().numpy(), enabled=True + ) + ps.show() diff --git a/fvdb/examples/overfit_sdf.py b/fvdb/examples/overfit_sdf.py new file mode 100644 index 0000000000..69be3eb49d --- /dev/null +++ b/fvdb/examples/overfit_sdf.py @@ -0,0 +1,116 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import os +import logging + 
+import numpy as np +import point_cloud_utils as pcu +import polyscope as ps +import torch +import tqdm +from common import load_happy_mesh + +from fvdb import GridBatch + + +def prepare_sdf(npts, ng): + logging.info("Loading data...") + v, f = load_happy_mesh(mode="vf", device=torch.device("cpu")) + v -= v.amin(0) + v /= v.amax() + v -= 0.5 * v.amax() + v = v.numpy() + f = f.type(torch.int32).numpy() + + n = pcu.estimate_mesh_vertex_normals(v, f) + fid, bc = pcu.sample_mesh_poisson_disk(v, f, npts) + pts = pcu.interpolate_barycentric_coords(f, fid, bc, v) + nms = pcu.interpolate_barycentric_coords(f, fid, bc, n) + logging.info("Done") + + logging.info("Generating grid samples") + gpts = np.stack( + [ + a.ravel() + for a in np.mgrid[ + v.min(0)[0] * 1.05 : v.max(0)[0] * 1.05 : ng * 1j, + v.min(0)[1] * 1.05 : v.max(0)[1] * 1.05 : ng * 1j, + v.min(0)[2] * 1.05 : v.max(0)[2] * 1.05 : ng * 1j, + ] + ], + axis=-1, + ).astype(pts.dtype) + logging.info("Done") + + logging.info("Computing SDF") + sdf, _, _ = pcu.signed_distance_to_mesh(gpts, v, f) + logging.info("Done") + + return pts, nms, gpts, sdf + + +def main(): + torch.random.manual_seed(5) + logging.basicConfig(level=logging.INFO) + logging.addLevelName(logging.INFO, "\033[1;32m%s\033[1;0m" % logging.getLevelName(logging.INFO)) + device = torch.device("cuda") + dtype = torch.float32 + vox_size = 0.005 + vox_origin = np.zeros(3) + vox_pad = 1 + ng = 256 + npts = 10_000 + num_iters = 750 + + # Cache the sdf data so we don't have to recompute it every single time + if not os.path.exists("sdf.npz"): + pts, nms, gpts, sdf = prepare_sdf(npts, ng) + np.savez("sdf.npz", pts=pts, nms=nms, gpts=gpts, sdf=sdf) + else: + dat = np.load("sdf.npz") + pts, nms, gpts, sdf = dat["pts"], dat["nms"], dat["gpts"], dat["sdf"] + + p, n = torch.from_numpy(pts).to(device).to(dtype), torch.from_numpy(nms).to(device).to(dtype) + + grid = GridBatch(device=device) + + grid.set_from_points(p, [-vox_pad] * 3, [vox_pad] * 3, vox_size, vox_origin) + dual_index = grid.dual_grid() + + mask = grid.points_in_active_voxel(torch.from_numpy(gpts).to(dtype).to(device)).jdata.cpu().numpy() + vol_pts = torch.from_numpy(gpts[mask]).to(device=device, dtype=dtype) + vol_sdf = torch.from_numpy(sdf[mask]).to(device=device, dtype=dtype).unsqueeze(-1) + + features = torch.randn(dual_index.total_voxels, 1).to(device).to(dtype) + features.requires_grad = True + + optimizer = torch.optim.Adam([features], lr=1e-2) + + # This should converge to around 2e-8 loss + pbar = tqdm.tqdm(range(num_iters)) + for _ in pbar: + optimizer.zero_grad() + vp_idx = torch.randperm(vol_pts.shape[0]) + vpts = vol_pts[vp_idx] + vsdf = vol_sdf[vp_idx] + + samp_sdf = dual_index.sample_trilinear(vpts, features).jdata + + loss = torch.nn.functional.mse_loss(samp_sdf, vsdf) + loss.backward() + pbar.set_postfix({"Loss": loss.item()}) + optimizer.step() + + ps.init() + pred_sdf = dual_index.sample_trilinear(vol_pts, features).jdata + assert isinstance(pred_sdf, torch.Tensor) + vol_pc = ps.register_point_cloud("pts", vol_pts.cpu().numpy()) + vol_pc.add_scalar_quantity("sdf_pred", pred_sdf.squeeze().detach().cpu().numpy()) + vol_pc.add_scalar_quantity("sdf_gt", vol_sdf.squeeze().detach().cpu().numpy()) + vol_pc.add_scalar_quantity("delta", (vol_sdf - pred_sdf).squeeze().abs().detach().cpu().numpy()) + ps.show() + + +if __name__ == "__main__": + main() diff --git a/fvdb/examples/ray_segment_marching.py b/fvdb/examples/ray_segment_marching.py new file mode 100644 index 0000000000..bfc9f5ae5c --- /dev/null +++ 
b/fvdb/examples/ray_segment_marching.py @@ -0,0 +1,102 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import logging +import timeit + +from fvdb import GridBatch, JaggedTensor +import torch +import polyscope as ps + +from common import load_dragon_mesh, make_ray_grid + + +def main(): + torch.random.manual_seed(5) + logging.basicConfig(level=logging.INFO) + logging.addLevelName(logging.INFO, "\033[1;32m%s\033[1;0m" % logging.getLevelName(logging.INFO)) + + device = torch.device("cuda") + dtype = torch.float32 + vox_size = 0.04 + vox_origin = torch.zeros(3).to(device) + + N = 10 # Maximum number of segments to intersect along ray + nrays = 100 + plot_every = 20 + batch_size = 2 + + p, n = load_dragon_mesh(device=device, dtype=dtype) + + p, n = load_dragon_mesh(device=device, dtype=dtype) + p -= p.mean(0) + p /= 10.0 + p = torch.concatenate( + [ + p, + p + 2 * torch.tensor([0, 0, 0.48], device=p.device), + p + 2 * torch.tensor([0, 0, 0.96], device=p.device), + # p + 1 * torch.tensor([0, 0, 1.44], device=p.device), + ] + ) + n = torch.concatenate([n, n, n]) + + ray_o, ray_d = make_ray_grid(nrays, [0.0, 0.1, -0.1], device=device, dtype=dtype) + pmt = torch.randperm(ray_o.shape[0]).to(device) + ray_o, ray_d = ray_o[pmt], ray_d[pmt] + + p, n = JaggedTensor([p] * batch_size), JaggedTensor([n] * batch_size) + ray_o, ray_d = JaggedTensor([ray_o] * batch_size), JaggedTensor([ray_d] * batch_size) + + grid = GridBatch(device=device) + grid.set_from_points(p, [-1] * 3, [1] * 3, voxel_sizes=vox_size, origins=vox_origin) + + gc, ge = grid.viz_edge_network + + logging.info(f"Tracing {nrays ** 2} Ray Segments...") + start = timeit.default_timer() + segments = grid.segments_along_rays(ray_o, ray_d, N, eps=1e-5) + if p.is_cuda: + torch.cuda.synchronize() + logging.info(f"Done in {timeit.default_timer() - start}s!") + + ps.init() + ps.set_ground_plane_mode("shadow_only") + + for b_i in range(batch_size): + ps.register_point_cloud("points", p[b_i].jdata.cpu(), radius=0.00025, point_render_mode="quad") + for i in range(0, len(ray_o[b_i].jdata), plot_every): + roi = ray_o[b_i].jdata[i].unsqueeze(0) # [1, 3] + rdi = ray_d[b_i].jdata[i].unsqueeze(0) # [1, 3] + segsi = segments[b_i][i].jdata # [N, 2] + + if segsi.numel() == 0: + continue + + rp = torch.cat( + [ + roi + segsi[:, 0].unsqueeze(-1) * rdi, + roi + segsi[:, 1].unsqueeze(-1) * rdi, + ] + ) + re = torch.stack([torch.arange(segsi.shape[0]), torch.arange(segsi.shape[0]) + segsi.shape[0]], dim=-1) + + ray_segs = ps.register_curve_network(f"ray segments {i}", rp.cpu(), re.cpu(), radius=0.00175) + rv = torch.zeros(re.shape[0]) + rv[::2] = 1.0 + ray_segs.add_scalar_quantity(f"segment colors {i}", rv.cpu(), defined_on="edges", enabled=True, cmap="jet") + + ps.register_point_cloud("grid corners", gc.jdata.cpu(), enabled=True, radius=0.00025, point_render_mode="quad") + ps.register_curve_network( + "grid edges", gc.jdata.cpu(), ge.jdata.cpu(), enabled=True, radius=0.00025, transparency=0.7 + ) + + # ray_dir_points = torch.cat([ray_o, ray_o + 0.5 * ray_d]) + # ray_dir_edges = torch.stack([torch.arange(ray_o.shape[0]), torch.arange(ray_o.shape[0]) + ray_o.shape[0]], dim=-1) + # ps.register_curve_network("ray directions", ray_dir_points, ray_dir_edges, radius=0.0005) + # ps.register_point_cloud("ray origins", ray_o, radius=0.01) + ps.show() + + +if __name__ == "__main__": + main() diff --git a/fvdb/examples/ray_voxel_marching.py b/fvdb/examples/ray_voxel_marching.py new file mode 100644 index 0000000000..a5ed3ee908 --- 
/dev/null +++ b/fvdb/examples/ray_voxel_marching.py @@ -0,0 +1,100 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import timeit +import logging + +import polyscope as ps +import torch + +import fvdb +from fvdb import JaggedTensor, GridBatch + +from common import load_dragon_mesh, make_ray_grid, plot_ray_segments + + +def main(): + logging.basicConfig(level=logging.INFO) + logging.addLevelName(logging.INFO, "\033[1;32m%s\033[1;0m" % logging.getLevelName(logging.INFO)) + device = torch.device("cuda") + dtype = torch.float32 + vox_size = 0.04 + vox_origin = [0.0, 0.0, 0.0] + + nrays = 1024 # 100 x 100 rays + plot_every = 512 # only plot every n rays + max_voxels = 20 # maximum number of voxels to intersect along ray + + p, n = load_dragon_mesh(device=device, dtype=dtype) + p -= p.mean(0) + p /= 10.0 + p = torch.concatenate( + [ + p, + p + 2 * torch.tensor([0, 0, 0.48], device=p.device), + p + 2 * torch.tensor([0, 0, 0.96], device=p.device), + # p + 1 * torch.tensor([0, 0, 1.44], device=p.device), + ] + ) + n = torch.concatenate([n, n, n]) + + batch_size = 2 + + p = fvdb.JaggedTensor([p] * batch_size) + n = fvdb.JaggedTensor([n] * batch_size) + + grid = GridBatch(device=device) + grid.set_from_points(p, [-1] * 3, [1] * 3, voxel_sizes=vox_size, origins=vox_origin) + + logging.info(f"Created {len(grid)} grids with {grid.total_voxels} total voxels") + gc, ge = grid.viz_edge_network + + ray_o, ray_d = make_ray_grid(nrays, [0.0, 0.0, -0.1], device=device, dtype=dtype) + pmt = torch.randperm(ray_o.shape[0]).to(device) + ray_o, ray_d = ray_o[pmt], ray_d[pmt] + + ray_o, ray_d = fvdb.JaggedTensor([ray_o] * batch_size), fvdb.JaggedTensor([ray_d] * batch_size) + + logging.info(f"Tracing {nrays ** 2} Rays Per Grid...") + start = timeit.default_timer() + vox, times = grid.voxels_along_rays(ray_o, ray_d, max_voxels, 1e-4) + if p.jdata.is_cuda: + torch.cuda.synchronize() + logging.info(f"Done in {timeit.default_timer() - start}s") + + logging.info(f"There are {len(vox)} sets of intersections in the batch") + for i, visect in enumerate(vox): + logging.info(f"There are {len(visect)} rays in the {i}th set of intersections") + + logging.info("Plotting") + ps.init() + for i in range(batch_size): + p_i = p[i].jdata.cpu() + ray_o_i, ray_d_i = ray_o[i].jdata.cpu(), ray_d[i].jdata.cpu() + times_i = times[i].cpu() + gc_i, ge_i = gc[i].cpu(), ge[i].cpu() + + ps.set_ground_plane_mode("shadow_only") + + ps.register_point_cloud("points", p_i, radius=0.00025) + logging.info("About to plot ray segments") + plot_ray_segments(ray_o_i, ray_d_i, times_i, plot_every) + logging.info("Plotted Ray Segments") + + logging.info(f"Creating a new grid of only the voxels intersected by this ray") + isected_grid = fvdb.sparse_grid_from_ijk(vox[i].jflatten(), voxel_sizes=vox_size, origins=vox_origin) + logging.info(f"Created {len(isected_grid)} grids with {isected_grid.total_voxels} total voxels") + iv, ie = isected_grid.viz_edge_network + ps.register_curve_network("intersected voxels", iv.jdata.cpu(), ie.jdata.cpu(), enabled=True, radius=0.0009) + ps.register_point_cloud("grid corners", gc_i.jdata, enabled=True, radius=0.001) + ps.register_curve_network("grid edges", gc_i.jdata, ge_i.jdata, enabled=True, radius=0.00015, transparency=0.7) + + # ray_dir_points = torch.cat([ray_o_i, ray_o_i + times_i.jdata.max() * ray_d_i]) + # ray_dir_edges = torch.stack([torch.arange(ray_o_i.shape[0]), torch.arange(ray_o_i.shape[0]) + ray_o_i.shape[0]], dim=-1) + # ps.register_curve_network("ray directions", 
ray_dir_points, ray_dir_edges, radius=0.0005) + # ps.register_point_cloud("ray origins", ray_o, radius=0.01) + ps.show() + + +if __name__ == "__main__": + main() diff --git a/fvdb/examples/sample_trilinear.py b/fvdb/examples/sample_trilinear.py new file mode 100644 index 0000000000..8711691c30 --- /dev/null +++ b/fvdb/examples/sample_trilinear.py @@ -0,0 +1,67 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import timeit +import logging + +import polyscope as ps +import torch +from fvdb import GridBatch + +from common import load_dragon_mesh + + +def main(): + logging.basicConfig(level=logging.INFO) + logging.addLevelName(logging.INFO, "\033[1;32m%s\033[1;0m" % logging.getLevelName(logging.INFO)) + device = torch.device("cuda") + dtype = torch.float32 + vox_size = 0.0025 + vox_origin = torch.zeros(3) + + p, n = load_dragon_mesh(skip_every=1, device=device, dtype=dtype) + + index = GridBatch(device=device) + index.set_from_points(p, voxel_sizes=vox_size, origins=vox_origin) + index_dual = index.dual_grid() + + nsplat = index.splat_trilinear(p, n) + gp = index.ijk + gd = index_dual.ijk + gp = index.grid_to_world(gp.type(dtype)) + gd = index_dual.grid_to_world(gd.type(dtype)) + + features = torch.ones(index_dual.total_voxels, 32).to(device).to(dtype) * torch.norm( + gd.jdata.type(dtype), dim=-1, keepdim=True + ) + features.requires_grad = True + + logging.info("Sampling features....") + start = timeit.default_timer() + features_trilerp = index_dual.sample_trilinear(p, features) + if features.is_cuda: + torch.cuda.synchronize() + logging.info(f"Done in {timeit.default_timer() - start}s!") + loss = features_trilerp.jdata.sum() + loss.backward() + + p, n = p.cpu(), n.cpu() + nsplat = nsplat.cpu() + gp, gd = gp.cpu(), gd.cpu() + features = features.detach().cpu() + features_trilerp = features_trilerp.detach().cpu() + + ps.init() + dual_grid_pts = ps.register_point_cloud("dual grid corners", gd.jdata, radius=0.001) + dual_grid_pts.add_scalar_quantity("feature norms", torch.norm(features, dim=-1), enabled=True) + + primal_grid_pts = ps.register_point_cloud("primal grid corners", gp.jdata, radius=0.0005) + primal_grid_pts.add_vector_quantity("splatted normals", nsplat.jdata, enabled=True, length=0.05, radius=0.001) + + surf_pts = ps.register_point_cloud("points", p, radius=0.0035) + surf_pts.add_scalar_quantity("sampled feature norms", torch.norm(features_trilerp.jdata, dim=-1), enabled=True) + ps.show() + + +if __name__ == "__main__": + main() diff --git a/fvdb/examples/splat_trilinear.py b/fvdb/examples/splat_trilinear.py new file mode 100644 index 0000000000..415fe338c4 --- /dev/null +++ b/fvdb/examples/splat_trilinear.py @@ -0,0 +1,54 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import timeit + +import logging +import polyscope as ps +import torch +from fvdb import GridBatch + +from common import load_dragon_mesh + + +def main(): + logging.basicConfig(level=logging.INFO) + logging.addLevelName(logging.INFO, "\033[1;32m%s\033[1;0m" % logging.getLevelName(logging.INFO)) + device = torch.device("cuda") + dtype = torch.float32 + + vox_size = 0.0025 + vox_origin = (0, 0, 0) + + p, n = load_dragon_mesh(skip_every=1, device=device, dtype=dtype) + + index = GridBatch(device=device) + index.set_from_points(p, voxel_sizes=vox_size, origins=vox_origin) + index_dual = index.dual_grid() + + logging.info("Splatting into grid...") + start = timeit.default_timer() + nsplat = index.splat_trilinear(p, n) + if p.is_cuda: + 
torch.cuda.synchronize() + logging.info(f"Done in {timeit.default_timer() - start}s!") + + gp = index.ijk + gd = index_dual.ijk + gp = index.grid_to_world(gp.type(dtype)) + gd = index_dual.grid_to_world(gd.type(dtype)) + + p, n = p.cpu(), n.cpu() + nsplat = nsplat.cpu() + gp, gd = gp.cpu(), gd.cpu() + + ps.init() + ps.register_point_cloud("points", p, radius=0.00075) + grid_pts = ps.register_point_cloud("vox coords", gp.jdata, radius=0.0005) + + grid_pts.add_vector_quantity("splatted normals", nsplat.jdata, enabled=True, length=0.05, radius=0.001) + ps.show() + + +if __name__ == "__main__": + main() diff --git a/fvdb/examples/subdivide.py b/fvdb/examples/subdivide.py new file mode 100644 index 0000000000..541a768f46 --- /dev/null +++ b/fvdb/examples/subdivide.py @@ -0,0 +1,74 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import time +import logging + +import polyscope as ps +import torch +from fvdb import GridBatch + +from common import load_dragon_mesh + + +def main(): + logging.basicConfig(level=logging.INFO) + logging.addLevelName(logging.INFO, "\033[1;32m%s\033[1;0m" % logging.getLevelName(logging.INFO)) + device = torch.device("cuda") + dtype = torch.float32 + + vox_size = 0.01 + vox_origin = (0.0, 0.0, 0.0) + p, n = load_dragon_mesh(device=device, dtype=dtype) + + index0 = GridBatch(device) + index0.set_from_points(p, [-1, -1, -1], [1, 1, 1], vox_size, vox_origin) + grids = [index0] + + logging.info("Splatting into grid...") + start = time.time() + nsplat = index0.splat_trilinear(p, n) + if device == "cuda": + torch.cuda.synchronize() + logging.info(f"Done in {time.time() - start}s!") + + logging.info("Building subdivided grids") + start = time.time() + for i in range(2): + subdiv_factor = i + 1 + mask = torch.rand(grids[i].total_voxels, device=device) > 0.5 + grids.append(grids[-1].subdivided_grid(subdiv_factor, mask)) + assert mask.sum().item() * subdiv_factor**3 == grids[-1].total_voxels + if device == "cuda": + torch.cuda.synchronize() + logging.info(f"Done in {time.time() - start}s!") + + p, n = p.cpu(), n.cpu() + + ps.init() + ps.register_point_cloud("points", p, radius=0.00075) + + for i, index in enumerate(grids): + dual_index = index.dual_grid() + gp = index.ijk.jdata + gd = dual_index.ijk.jdata + dual_v, dual_e = index.viz_edge_network + + dual_v = dual_v.jdata.cpu() + dual_e = dual_e.jdata.cpu() + gp = index.grid_to_world(gp.to(dtype)).cpu() + gd = dual_index.grid_to_world(gd.to(dtype)).cpu() + gp, gd = gp.cpu().jdata, gd.cpu().jdata + + ps.register_curve_network(f"grid edges {i}", dual_v.cpu(), dual_e.cpu(), enabled=True, radius=0.0005) + ps.register_point_cloud(f"vox corners {i}", gd, radius=0.0005 * (i + 1)) + if i == 0: + grid_pts = ps.register_point_cloud("vox centers", gp, radius=0.0005) + grid_pts.add_vector_quantity( + "splatted normals", nsplat.jdata.cpu(), enabled=True, length=0.05, radius=0.001 + ) + ps.show() + + +if __name__ == "__main__": + main() diff --git a/fvdb/examples/uniform_ray_marching.py b/fvdb/examples/uniform_ray_marching.py new file mode 100644 index 0000000000..d6176d7a46 --- /dev/null +++ b/fvdb/examples/uniform_ray_marching.py @@ -0,0 +1,124 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import os +import time +import logging +import timeit + +import numpy as np +import point_cloud_utils as pcu +import polyscope as ps +import torch +from fvdb import GridBatch, JaggedTensor +import fvdb + +from common import load_dragon_mesh, make_ray_grid, 
plot_ray_segments + + +def main(): + torch.random.manual_seed(5) + logging.basicConfig(level=logging.INFO) + logging.addLevelName(logging.INFO, "\033[1;32m%s\033[1;0m" % logging.getLevelName(logging.INFO)) + + device = torch.device("cuda") + dtype = torch.float32 + vox_size = 0.04 + vox_origin = torch.zeros(3).to(device) + step_size = 0.5 * vox_size + + N = 10 # Maximum number of segments to intersect along ray + nrays = 100 + plot_every = 20 + batch_size = 2 + + p, n = load_dragon_mesh(device=device, dtype=dtype) + + p, n = load_dragon_mesh(device=device, dtype=dtype) + p -= p.mean(0) + p /= 10.0 + p = torch.concatenate( + [ + p, + p + 2 * torch.tensor([0, 0, 0.48], device=p.device), + p + 2 * torch.tensor([0, 0, 0.96], device=p.device), + # p + 1 * torch.tensor([0, 0, 1.44], device=p.device), + ] + ) + n = torch.concatenate([n, n, n]) + + ray_o, ray_d = make_ray_grid(nrays, [0.0, 0.1, -0.1], device=device, dtype=dtype) + pmt = torch.randperm(ray_o.shape[0]).to(device) + ray_o, ray_d = ray_o[pmt], ray_d[pmt] + + p, n = JaggedTensor([p] * batch_size), JaggedTensor([n] * batch_size) + ray_o, ray_d = JaggedTensor([ray_o] * batch_size), JaggedTensor([ray_d] * batch_size) + + grid = GridBatch(device=device) + grid.set_from_points(p, [-1] * 3, [1] * 3, voxel_sizes=vox_size, origins=vox_origin) + + gc, ge = grid.viz_edge_network + + logging.info(f"Tracing {nrays ** 2} Ray Segments...") + start = timeit.default_timer() + segments = grid.segments_along_rays(ray_o, ray_d, N, eps=1e-5) + if p.is_cuda: + torch.cuda.synchronize() + logging.info(f"Done in {timeit.default_timer() - start}s!") + + tmin = fvdb.jzeros(ray_o.lshape, device=device, dtype=dtype) + tmax = fvdb.jones(ray_o.lshape, device=device, dtype=dtype) * 1e10 + + logging.info(f"Generating samples for {ray_o.rshape[0]} Ray Segments...") + start = timeit.default_timer() + ray_ts = grid.uniform_ray_samples(ray_o, ray_d, tmin, tmax, step_size, eps=1e-4) + if p.is_cuda: + torch.cuda.synchronize() + logging.info(f"Done in {timeit.default_timer() - start}s!") + + print(ray_ts.eshape) + ps.init() + ps.set_ground_plane_mode("shadow_only") + + for b_i in range(batch_size): + ps.register_point_cloud("points", p[b_i].jdata.cpu(), radius=0.00025, point_render_mode="quad") + for i in range(0, len(ray_o[b_i].jdata), plot_every): + roi = ray_o[b_i].jdata[i].unsqueeze(0) # [1, 3] + rdi = ray_d[b_i].jdata[i].unsqueeze(0) # [1, 3] + segsi = segments[b_i][i].jdata # [N, 2] + + if segsi.numel() == 0: + continue + + rp = torch.cat( + [ + roi + segsi[:, 0].unsqueeze(-1) * rdi, + roi + segsi[:, 1].unsqueeze(-1) * rdi, + ] + ) + re = torch.stack([torch.arange(segsi.shape[0]), torch.arange(segsi.shape[0]) + segsi.shape[0]], dim=-1) + + # ray_segs = ps.register_curve_network(f"ray segments {i}", rp.cpu(), re.cpu(), radius=0.00075) + + ray_ts_i = ray_ts[b_i][i].jdata + ray_ts_i = 0.5 * (ray_ts_i[:, 0] + ray_ts_i[:, 1]) + ray_samples = roi + ray_ts_i.unsqueeze(-1) * rdi + ps.register_point_cloud(f"ray samples {i}", ray_samples.cpu(), radius=0.0015) + # rv = torch.zeros(re.shape[0]) + # rv[::2] = 1.0 + # ray_segs.add_scalar_quantity(f"segment colors {i}", rv.cpu(), defined_on="edges", enabled=True, cmap="jet") + + ps.register_point_cloud("grid corners", gc.jdata.cpu(), enabled=True, radius=0.00025, point_render_mode="quad") + ps.register_curve_network( + "grid edges", gc.jdata.cpu(), ge.jdata.cpu(), enabled=True, radius=0.00025, transparency=0.7 + ) + + # ray_dir_points = torch.cat([ray_o, ray_o + 0.5 * ray_d]) + # ray_dir_edges = 
torch.stack([torch.arange(ray_o.shape[0]), torch.arange(ray_o.shape[0]) + ray_o.shape[0]], dim=-1) + # ps.register_curve_network("ray directions", ray_dir_points, ray_dir_edges, radius=0.0005) + # ps.register_point_cloud("ray origins", ray_o, radius=0.01) + ps.show() + + +if __name__ == "__main__": + main() diff --git a/fvdb/examples/voxel_neighborhood.py b/fvdb/examples/voxel_neighborhood.py new file mode 100644 index 0000000000..c69576768f --- /dev/null +++ b/fvdb/examples/voxel_neighborhood.py @@ -0,0 +1,49 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: MPL-2.0 +# +import numpy as np +import polyscope as ps +import torch +from common import load_dragon_mesh + +from fvdb import GridBatch, sparse_grid_from_ijk + + +def main(): + device = "cuda" + + vox_size = 0.0075 + vox_origin = (0, 0, 0) + N = 1 + + [p] = load_dragon_mesh(mode="v", skip_every=N, device=torch.device(device)) + + index = GridBatch(device=device) + index.set_from_points(p, [-1, -1, -1], [1, 1, 1], vox_size, vox_origin) + + primal_voxels = index.ijk.jdata + + nhood = index.neighbor_indexes(primal_voxels, 1, 0).jdata + + ps.init() + for _ in range(10): + randvox = np.random.randint(nhood.shape[0]) + + voxijk = primal_voxels[randvox] + nbrs = primal_voxels[nhood[randvox][nhood[randvox] >= 0]] + print(nhood[randvox]) + nhood_ijk = torch.cat([voxijk.unsqueeze(0), nbrs], dim=0) + + vp, ve = index.viz_edge_network + vp, ve = vp.jdata, ve.jdata + + vi, vei = sparse_grid_from_ijk(nhood_ijk, voxel_sizes=vox_size, origins=vox_origin).viz_edge_network + vi, vei = vi.jdata, vei.jdata + + ps.register_curve_network("vox", vp.cpu().numpy(), ve.cpu().numpy(), radius=0.0025) + ps.register_curve_network("nhd", vi.cpu().numpy(), vei.cpu().numpy(), radius=0.005) + ps.show() + + +if __name__ == "__main__": + main() diff --git a/fvdb/fvdb/_Cpp.pyi b/fvdb/fvdb/_Cpp.pyi index 30d2d9fea5..c262a47c5b 100644 --- a/fvdb/fvdb/_Cpp.pyi +++ b/fvdb/fvdb/_Cpp.pyi @@ -6,16 +6,31 @@ import numpy import torch from enum import Enum - Numeric = Union[int, float] TorchDeviceOrString = Union[torch.device, str] -Vec3iBatch = Union[torch.Tensor, numpy.ndarray, List[int], List[List[int]], - Tuple[int, int, int], List[Tuple[int, int, int]]] -Vec3dBatch = Union[torch.Tensor, numpy.ndarray, List[float], List[List[float]], - Tuple[float, float, float], List[Tuple[float, float, float]], Vec3iBatch] -Vec3dBatchOrScalar = Union[torch.Tensor, numpy.ndarray, List[float], List[List[float]], - Tuple[float, float, float], List[Tuple[float, float, float]], - float, Vec3iBatch, int] +Vec3iBatch = Union[ + torch.Tensor, numpy.ndarray, List[int], List[List[int]], Tuple[int, int, int], List[Tuple[int, int, int]] +] +Vec3dBatch = Union[ + torch.Tensor, + numpy.ndarray, + List[float], + List[List[float]], + Tuple[float, float, float], + List[Tuple[float, float, float]], + Vec3iBatch, +] +Vec3dBatchOrScalar = Union[ + torch.Tensor, + numpy.ndarray, + List[float], + List[List[float]], + Tuple[float, float, float], + List[Tuple[float, float, float]], + float, + Vec3iBatch, + int, +] Vec3i = Union[torch.Tensor, numpy.ndarray, List[int], Tuple[int, int, int]] Vec3d = Union[torch.Tensor, numpy.ndarray, List[float], Tuple[float, float, float]] @@ -46,7 +61,6 @@ class JaggedTensor: def type(self, arg0: torch.dtype) -> JaggedTensor: ... def to(self, device: TorchDeviceOrString | torch.dtype) -> JaggedTensor: ... def rmask(self, mask: torch.Tensor) -> JaggedTensor: ... - def __add__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... 
def __sub__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __mul__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... @@ -54,7 +68,6 @@ class JaggedTensor: def __truediv__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __floordiv__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __mod__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... - def __iadd__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __isub__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __imul__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... @@ -62,32 +75,26 @@ class JaggedTensor: def __itruediv__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __ifloordiv__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __imod__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... - def __gt__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __ge__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __lt__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __le__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __eq__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... def __ne__(self, other: Union[int, float, JaggedTensor]) -> JaggedTensor: ... - def __getitem__(self, idx: Index | JaggedTensor) -> JaggedTensor: ... def __iter__(self) -> Iterator[JaggedTensor]: ... def __len__(self) -> int: ... - def __getstate__(self) -> tuple: ... def __setstate__(self, arg0: tuple) -> None: ... - def detach(self) -> JaggedTensor: ... def requires_grad_(self, requires_grad: bool) -> JaggedTensor: ... def jagged_like(self, data: torch.Tensor) -> JaggedTensor: ... def clone(self) -> JaggedTensor: ... - def sqrt(self) -> JaggedTensor: ... def abs(self) -> JaggedTensor: ... def round(self, decimals: int = ...) -> JaggedTensor: ... def floor(self) -> JaggedTensor: ... def ceil(self) -> JaggedTensor: ... - def sqrt_(self) -> JaggedTensor: ... def abs_(self) -> JaggedTensor: ... def round_(self, decimals: int = ...) -> JaggedTensor: ... @@ -95,18 +102,13 @@ class JaggedTensor: def ceil_(self) -> JaggedTensor: ... # def jagged_argsort(self) -> JaggedTensor: ... - def jsum(self, dim : int = 0, keepdim : bool = False) -> JaggedTensor: ... - def jmin(self, dim : int = 0, keepdim : bool = False) -> Tuple[JaggedTensor, JaggedTensor]: ... - def jmax(self, dim : int = 0, keepdim : bool = False) -> Tuple[JaggedTensor, JaggedTensor]: ... - + def jsum(self, dim: int = 0, keepdim: bool = False) -> JaggedTensor: ... + def jmin(self, dim: int = 0, keepdim: bool = False) -> Tuple[JaggedTensor, JaggedTensor]: ... + def jmax(self, dim: int = 0, keepdim: bool = False) -> Tuple[JaggedTensor, JaggedTensor]: ... def jreshape(self, lshape: LShapeSpec) -> JaggedTensor: ... - def jreshape_as(self, other: JaggedTensor) -> JaggedTensor: ... - def jflatten(self, dim: int = 0) -> JaggedTensor: ... - def unbind(self) -> Union[List[torch.Tensor], List[List[torch.Tensor]]]: ... - @property def num_tensors(self) -> int: ... @property @@ -135,19 +137,18 @@ class JaggedTensor: def edim(self) -> int: ... @property def requires_grad(self) -> bool: ... - @staticmethod def from_data_and_indices(data: torch.Tensor, indices: torch.Tensor, num_tensors: int) -> JaggedTensor: ... 
- @staticmethod - def from_data_indices_and_list_ids(data: torch.Tensor, indices: torch.Tensor, list_ids: torch.Tensor, num_tensors: int) -> JaggedTensor: ... - + def from_data_indices_and_list_ids( + data: torch.Tensor, indices: torch.Tensor, list_ids: torch.Tensor, num_tensors: int + ) -> JaggedTensor: ... @staticmethod def from_data_and_offsets(data: torch.Tensor, offsets: torch.Tensor) -> JaggedTensor: ... - @staticmethod - def from_data_offsets_and_list_ids(data: torch.Tensor, offsets: torch.Tensor, list_ids: torch.Tensor) -> JaggedTensor: ... - + def from_data_offsets_and_list_ids( + data: torch.Tensor, offsets: torch.Tensor, list_ids: torch.Tensor + ) -> JaggedTensor: ... JaggedTensorOrTensor = Union[torch.Tensor, JaggedTensor] @@ -212,7 +213,6 @@ class GridBatch: def total_bbox(self) -> torch.IntTensor: ... @property def address(self) -> int: ... - def voxel_size_at(self, bi: int) -> torch.FloatTensor: ... def origin_at(self, bi: int) -> torch.FloatTensor: ... def num_voxels_at(self, bi: int) -> int: ... @@ -221,71 +221,157 @@ class GridBatch: def cum_enabled_voxels_at(self, bi: int) -> int: ... def bbox_at(self, bi: int) -> torch.IntTensor: ... def dual_bbox_at(self, bi: int) -> torch.IntTensor: ... - def jagged_like(self, data: torch.Tensor, ignore_disabled: bool = ...) -> JaggedTensor: ... - def set_global_origin(self, origin: Vec3d) -> None: ... def set_global_voxel_size(self, voxel_size: Vec3dOrScalar) -> None: ... - - def set_from_dense_grid(self, num_grids: int, dense_dims: Vec3i, ijk_min: Vec3i = ..., voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ..., mask: Optional[torch.Tensor] = ...) -> None: ... - def set_from_ijk(self, ijk: JaggedTensorOrTensor, pad_min: Vec3i = ..., pad_max: Vec3i = ..., voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ...) -> None: ... - def set_from_nearest_voxels_to_points(self, points: JaggedTensorOrTensor, voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ...) -> None: ... - def set_from_points(self, points: JaggedTensorOrTensor, pad_min: Vec3i = ..., pad_max: Vec3i = ..., voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ...) -> None: ... - def set_from_mesh(self, mesh_vertices: JaggedTensorOrTensor, mesh_faces: JaggedTensorOrTensor, voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ...) -> None: ... - + def set_from_dense_grid( + self, + num_grids: int, + dense_dims: Vec3i, + ijk_min: Vec3i = ..., + voxel_sizes: Vec3dBatchOrScalar = ..., + origins: Vec3dBatch = ..., + mask: Optional[torch.Tensor] = ..., + ) -> None: ... + def set_from_ijk( + self, + ijk: JaggedTensorOrTensor, + pad_min: Vec3i = ..., + pad_max: Vec3i = ..., + voxel_sizes: Vec3dBatchOrScalar = ..., + origins: Vec3dBatch = ..., + ) -> None: ... + def set_from_nearest_voxels_to_points( + self, points: JaggedTensorOrTensor, voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ... + ) -> None: ... + def set_from_points( + self, + points: JaggedTensorOrTensor, + pad_min: Vec3i = ..., + pad_max: Vec3i = ..., + voxel_sizes: Vec3dBatchOrScalar = ..., + origins: Vec3dBatch = ..., + ) -> None: ... + def set_from_mesh( + self, + mesh_vertices: JaggedTensorOrTensor, + mesh_faces: JaggedTensorOrTensor, + voxel_sizes: Vec3dBatchOrScalar = ..., + origins: Vec3dBatch = ..., + ) -> None: ... def read_from_dense(self, dense_data: torch.Tensor, dense_origins: Vec3iBatch = ...) -> JaggedTensor: ... - def read_into_dense(self, sparse_data: JaggedTensorOrTensor, min_coord: Optional[Vec3iBatch] = ..., grid_size: Optional[Vec3i] = ...) 
-> torch.Tensor: ... - - def clip(self, features: JaggedTensorOrTensor, ijk_min: Vec3iBatch, ijk_max: Vec3iBatch) -> Tuple[JaggedTensor, GridBatch]: ... + def read_into_dense( + self, sparse_data: JaggedTensorOrTensor, min_coord: Optional[Vec3iBatch] = ..., grid_size: Optional[Vec3i] = ... + ) -> torch.Tensor: ... + def clip( + self, features: JaggedTensorOrTensor, ijk_min: Vec3iBatch, ijk_max: Vec3iBatch + ) -> Tuple[JaggedTensor, GridBatch]: ... def clipped_grid(self, ijk_min: Vec3iBatch, ijk_max: Vec3iBatch) -> GridBatch: ... - def dual_grid(self, exclude_border: bool = False) -> GridBatch: ... - - def fill_to_grid(self, features: JaggedTensor, other_grid: GridBatch, default_value: float = ...) -> JaggedTensor: ... - + def fill_to_grid( + self, features: JaggedTensor, other_grid: GridBatch, default_value: float = ... + ) -> JaggedTensor: ... def coarsened_grid(self, coarsening_factor: Vec3iOrScalar) -> GridBatch: ... def subdivided_grid(self, subdiv_factor: Vec3iOrScalar, mask: JaggedTensorOrTensor = ...) -> GridBatch: ... - - def max_pool(self, pool_factor: Vec3iOrScalar, data: JaggedTensorOrTensor, stride: Vec3iOrScalar = 0, coarse_grid: Optional[GridBatch] = None) -> Tuple[JaggedTensor, GridBatch]: ... - def avg_pool(self, pool_factor: Vec3iOrScalar, data: JaggedTensorOrTensor, stride: Vec3iOrScalar = 0, coarse_grid: Optional[GridBatch] = None) -> Tuple[JaggedTensor, GridBatch]: ... - def subdivide(self, subdiv_factor: Vec3iOrScalar, data: JaggedTensorOrTensor, mask: Optional[JaggedTensorOrTensor] = None, fine_grid: Optional[GridBatch] = None) -> Tuple[JaggedTensor, GridBatch]: ... - + def max_pool( + self, + pool_factor: Vec3iOrScalar, + data: JaggedTensorOrTensor, + stride: Vec3iOrScalar = 0, + coarse_grid: Optional[GridBatch] = None, + ) -> Tuple[JaggedTensor, GridBatch]: ... + def avg_pool( + self, + pool_factor: Vec3iOrScalar, + data: JaggedTensorOrTensor, + stride: Vec3iOrScalar = 0, + coarse_grid: Optional[GridBatch] = None, + ) -> Tuple[JaggedTensor, GridBatch]: ... + def subdivide( + self, + subdiv_factor: Vec3iOrScalar, + data: JaggedTensorOrTensor, + mask: Optional[JaggedTensorOrTensor] = None, + fine_grid: Optional[GridBatch] = None, + ) -> Tuple[JaggedTensor, GridBatch]: ... def disable_ijk(self, ijk: JaggedTensorOrTensor) -> None: ... def enable_ijk(self, ijk: JaggedTensorOrTensor) -> None: ... - def points_in_active_voxel(self, xyz: JaggedTensorOrTensor, ignore_disabled: bool = False) -> JaggedTensor: ... def coords_in_active_voxel(self, ijk: JaggedTensorOrTensor, ignore_disabled: bool = False) -> JaggedTensor: ... - def cubes_in_grid(self, cube_centers: JaggedTensorOrTensor, cube_min: Vec3dOrScalar = 0.0, cube_max: Vec3dOrScalar = 0.0, ignore_disabled: bool = False) -> JaggedTensor: ... - def cubes_intersect_grid(self, cube_centers: JaggedTensorOrTensor, cube_min: Vec3dOrScalar = 0.0, cube_max: Vec3dOrScalar = 0.0, ignore_disabled: bool = False) -> JaggedTensor: ... - + def cubes_in_grid( + self, + cube_centers: JaggedTensorOrTensor, + cube_min: Vec3dOrScalar = 0.0, + cube_max: Vec3dOrScalar = 0.0, + ignore_disabled: bool = False, + ) -> JaggedTensor: ... + def cubes_intersect_grid( + self, + cube_centers: JaggedTensorOrTensor, + cube_min: Vec3dOrScalar = 0.0, + cube_max: Vec3dOrScalar = 0.0, + ignore_disabled: bool = False, + ) -> JaggedTensor: ... def ijk_to_index(self, ijk: JaggedTensorOrTensor, cumulative: bool = False) -> JaggedTensor: ... def ijk_to_inv_index(self, ijk: JaggedTensorOrTensor, cumulative: bool = False) -> JaggedTensor: ... 
def neighbor_indexes(self, ijk: JaggedTensorOrTensor, extent: int, bitshift: int = 0) -> JaggedTensor: ... - - def splat_bezier(self, points: JaggedTensorOrTensor, points_data: JaggedTensorOrTensor) -> JaggedTensor: ... + def splat_bezier(self, points: JaggedTensorOrTensor, points_data: JaggedTensorOrTensor) -> JaggedTensor: ... def splat_trilinear(self, points: JaggedTensorOrTensor, points_data: JaggedTensorOrTensor) -> JaggedTensor: ... def sample_bezier(self, points: JaggedTensorOrTensor, voxel_data: JaggedTensorOrTensor) -> JaggedTensor: ... - def sample_bezier_with_grad(self, points: JaggedTensorOrTensor, voxel_data: JaggedTensorOrTensor) -> Tuple[JaggedTensor, JaggedTensor]: ... + def sample_bezier_with_grad( + self, points: JaggedTensorOrTensor, voxel_data: JaggedTensorOrTensor + ) -> Tuple[JaggedTensor, JaggedTensor]: ... def sample_trilinear(self, points: JaggedTensorOrTensor, voxel_data: JaggedTensorOrTensor) -> JaggedTensor: ... - def sample_trilinear_with_grad(self, points: JaggedTensorOrTensor, voxel_data: JaggedTensorOrTensor) -> Tuple[JaggedTensor, JaggedTensor]: ... - - - def segments_along_rays(self, ray_origins: JaggedTensorOrTensor, ray_directions: JaggedTensorOrTensor, max_segments: int, eps: float = 0.0, ignore_masked: bool = False) -> JaggedTensor: ... - def voxels_along_rays(self, ray_origins: JaggedTensorOrTensor, ray_directions: JaggedTensorOrTensor, max_voxels: int, eps: float = 0.0, return_ijk: bool = True, cumulative: bool = False) -> Tuple[JaggedTensor, JaggedTensor]: ... - def uniform_ray_samples(self, ray_origins: JaggedTensorOrTensor, ray_directions: JaggedTensorOrTensor, t_min: JaggedTensorOrTensor, t_max: JaggedTensorOrTensor, step_size: float, cone_angle: float = 0.0, include_end_segments : bool = True, return_midpoints: bool = False, eps: float = 0.0) -> JaggedTensor: ... - def ray_implicit_intersection(self, ray_origins: JaggedTensorOrTensor, ray_directions: JaggedTensorOrTensor, grid_scalars: JaggedTensorOrTensor, eps: float = 0.0) -> JaggedTensor: ... - + def sample_trilinear_with_grad( + self, points: JaggedTensorOrTensor, voxel_data: JaggedTensorOrTensor + ) -> Tuple[JaggedTensor, JaggedTensor]: ... + def segments_along_rays( + self, + ray_origins: JaggedTensorOrTensor, + ray_directions: JaggedTensorOrTensor, + max_segments: int, + eps: float = 0.0, + ignore_masked: bool = False, + ) -> JaggedTensor: ... + def voxels_along_rays( + self, + ray_origins: JaggedTensorOrTensor, + ray_directions: JaggedTensorOrTensor, + max_voxels: int, + eps: float = 0.0, + return_ijk: bool = True, + cumulative: bool = False, + ) -> Tuple[JaggedTensor, JaggedTensor]: ... + def uniform_ray_samples( + self, + ray_origins: JaggedTensorOrTensor, + ray_directions: JaggedTensorOrTensor, + t_min: JaggedTensorOrTensor, + t_max: JaggedTensorOrTensor, + step_size: float, + cone_angle: float = 0.0, + include_end_segments: bool = True, + return_midpoints: bool = False, + eps: float = 0.0, + ) -> JaggedTensor: ... + def ray_implicit_intersection( + self, + ray_origins: JaggedTensorOrTensor, + ray_directions: JaggedTensorOrTensor, + grid_scalars: JaggedTensorOrTensor, + eps: float = 0.0, + ) -> JaggedTensor: ... def grid_to_world(self, ijk: JaggedTensorOrTensor) -> JaggedTensor: ... def world_to_grid(self, ijk: JaggedTensorOrTensor) -> JaggedTensor: ... - - def marching_cubes(self, field: JaggedTensorOrTensor, level: float = 0.0) -> Tuple[JaggedTensor, JaggedTensor, JaggedTensor]: ... 
- - def sparse_conv_kernel_map(self, kernel_size: Union[int, Sequence], stride: Union[int, Sequence], target_grid: Optional[GridBatch] = None) -> Tuple[SparseConvPackInfo, GridBatch]: ... + def marching_cubes( + self, field: JaggedTensorOrTensor, level: float = 0.0 + ) -> Tuple[JaggedTensor, JaggedTensor, JaggedTensor]: ... + def sparse_conv_kernel_map( + self, kernel_size: Union[int, Sequence], stride: Union[int, Sequence], target_grid: Optional[GridBatch] = None + ) -> Tuple[SparseConvPackInfo, GridBatch]: ... def sparse_conv_halo(self, input: JaggedTensorOrTensor, weight: torch.Tensor, variant: int = 8) -> JaggedTensor: ... - def is_contiguous(self) -> bool: ... def contiguous(self) -> GridBatch: ... - @overload def to(self, device: TorchDeviceOrString) -> GridBatch: ... @overload @@ -294,7 +380,6 @@ class GridBatch: def to(self, to_jtensor: JaggedTensor) -> GridBatch: ... @overload def to(self, to_grid: GridBatch) -> GridBatch: ... - @overload def __getitem__(self, arg0: int) -> GridBatch: ... @overload @@ -307,26 +392,37 @@ class GridBatch: def __getitem__(self, arg0: torch.Tensor) -> GridBatch: ... @overload def __getitem__(self, arg0: numpy.ndarray) -> GridBatch: ... - def __len__(self) -> int: ... - def __iter__(self) -> Iterator[GridBatch]: ... - def __getstate__(self) -> tuple: ... def __setstate__(self, arg0: tuple) -> None: ... - class ConvPackBackend(Enum): GATHER_SCATTER = 0 IGEMM = 1 CUTLASS = 2 LGGS = 3 - class SparseConvPackInfo: - def __init__(self, kernel_size: Vec3iOrScalar, stride: Vec3iOrScalar, source_grid: GridBatch, target_grid: Optional[GridBatch]) -> None: ... - def sparse_conv_3d(self, input: JaggedTensorOrTensor, weights: torch.Tensor, backend: ConvPackBackend = ConvPackBackend.GATHER_SCATTER) -> JaggedTensor: ... - def sparse_transpose_conv_3d(self, input: JaggedTensorOrTensor, weights: torch.Tensor, backend: ConvPackBackend = ConvPackBackend.GATHER_SCATTER) -> JaggedTensor: ... + def __init__( + self, + kernel_size: Vec3iOrScalar, + stride: Vec3iOrScalar, + source_grid: GridBatch, + target_grid: Optional[GridBatch], + ) -> None: ... + def sparse_conv_3d( + self, + input: JaggedTensorOrTensor, + weights: torch.Tensor, + backend: ConvPackBackend = ConvPackBackend.GATHER_SCATTER, + ) -> JaggedTensor: ... + def sparse_transpose_conv_3d( + self, + input: JaggedTensorOrTensor, + weights: torch.Tensor, + backend: ConvPackBackend = ConvPackBackend.GATHER_SCATTER, + ) -> JaggedTensor: ... @property def kernel_size(self) -> Tuple: ... @property @@ -357,73 +453,126 @@ class SparseConvPackInfo: def halo_index_buffer(self) -> torch.Tensor: ... @property def output_index_buffer(self) -> torch.Tensor: ... - @property def block_kernel_ranges(self) -> torch.Tensor: ... @property def block_kernel_rel_out_idx(self) -> torch.Tensor: ... @property def block_kernel_in_idx(self) -> torch.Tensor: ... - @property def source_grid(self) -> GridBatch: ... @property def stride(self) -> Tuple: ... @property def target_grid(self) -> GridBatch: ... - def build_gather_scatter(self, use_me: bool = False) -> None: ... - def build_implicit_gemm(self, sorted: bool = False, split_mask_num: int = 1, - training: bool = False, split_mask_num_bwd: int = 1, - use_tf32: bool = False) -> None: ... + def build_implicit_gemm( + self, + sorted: bool = False, + split_mask_num: int = 1, + training: bool = False, + split_mask_num_bwd: int = 1, + use_tf32: bool = False, + ) -> None: ... def build_cutlass(self, benchmark: bool = False) -> None: ... def build_lggs(self) -> None: ... 
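For readers skimming these stubs, a minimal sketch of how GridBatch.sparse_conv_kernel_map, SparseConvPackInfo.build_gather_scatter and sparse_conv_3d are meant to compose. This is not part of the patch: the voxel size, feature width and the (kernel_volume, in_channels, out_channels) weight layout are illustrative assumptions only.

import torch
import fvdb

# Build a sparse grid from random points and attach per-voxel features.
pts = fvdb.JaggedTensor([torch.rand(10_000, 3, device="cuda")])
grid = fvdb.sparse_grid_from_points(pts, voxel_sizes=0.01, origins=[0.0, 0.0, 0.0])
feats = grid.jagged_like(torch.randn(grid.total_voxels, 16, device="cuda"))

# Build the kernel map once (stride 1 keeps the same topology), then reuse it
# for every convolution over this grid pair.
packinfo, out_grid = grid.sparse_conv_kernel_map(3, 1)
packinfo.build_gather_scatter()

weights = torch.randn(27, 16, 32, device="cuda")  # assumed (kernel_volume, in, out) layout
out_feats = packinfo.sparse_conv_3d(feats, weights, backend=fvdb.ConvPackBackend.GATHER_SCATTER)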
@overload def jcat(grid_batches: List[GridBatch]) -> GridBatch: ... - @overload def jcat(jagged_tensors: List[JaggedTensorOrTensor], dim: int | None = ...) -> JaggedTensor: ... - -def sparse_grid_from_ijk(ijk: JaggedTensorOrTensor, pad_min: Vec3i = ..., pad_max: Vec3i = ..., voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ..., mutable: bool = ...) -> GridBatch: ... -def sparse_grid_from_nearest_voxels_to_points(points: JaggedTensorOrTensor, voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ..., mutable: bool = ...) -> GridBatch: ... -def sparse_grid_from_points(points: JaggedTensorOrTensor, pad_min: Vec3i = ..., pad_max: Vec3i = ..., voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ..., mutable: bool = ...) -> GridBatch: ... -def sparse_grid_from_dense(num_grids: int, dense_dims: Vec3i, ijk_min: Vec3i = ..., voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ..., device: TorchDeviceOrString = ..., mutable: bool = ...) -> GridBatch: ... -def sparse_grid_from_mesh(vertices: JaggedTensorOrTensor, faces: JaggedTensorOrTensor, voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ..., device: TorchDeviceOrString = ..., mutable: bool = ...) -> GridBatch: ... - -def volume_render(sigmas: torch.Tensor, rgbs: torch.Tensor, deltaTs: torch.Tensor, ts: torch.Tensor, packInfo: torch.Tensor, transmittanceThresh: float) -> List[torch.Tensor]: ... - -def load(path: str, grid_id: Optional[GridIdentifier] = None, device: TorchDeviceOrString = 'cpu', verbose: bool = False) -> Tuple[GridBatch, JaggedTensor, list[str]]: ... -def save(path: str, grid: GridBatch, data: Optional[JaggedTensorOrTensor] = None, names: Optional[Union[str , List[str]]] = None, compressed: bool = False, verbose: bool = False): ... - - -def jrand(lsizes: LShapeSpec, - rsizes: RShapeSpec | None = None, - dtype: torch.dtype | None = None, - device: TorchDeviceOrString | None = None, - requires_grad: bool = False, - pin_memory: bool = False) -> JaggedTensor: ... -def jrandn(lsizes: LShapeSpec, - rsizes: RShapeSpec | None = None, - dtype: torch.dtype | None = None, - device: TorchDeviceOrString | None = None, - requires_grad: bool = False, - pin_memory: bool = False) -> JaggedTensor: ... -def jones(lsizes: LShapeSpec, - rsizes: RShapeSpec | None = None, - dtype: torch.dtype | None = None, - device: TorchDeviceOrString | None = None, - requires_grad: bool = False, - pin_memory: bool = False) -> JaggedTensor: ... -def jzeros(lsizes: LShapeSpec, - rsizes: RShapeSpec | None = None, - dtype: torch.dtype | None = None, - device: TorchDeviceOrString | None = None, - requires_grad: bool = False, - pin_memory: bool = False) -> JaggedTensor: ... -def jempty(lsizes: LShapeSpec, - rsizes: RShapeSpec | None = None, - dtype: torch.dtype | None = None, - device: TorchDeviceOrString | None = None, - requires_grad: bool = False, - pin_memory: bool = False) -> JaggedTensor: ... \ No newline at end of file +def sparse_grid_from_ijk( + ijk: JaggedTensorOrTensor, + pad_min: Vec3i = ..., + pad_max: Vec3i = ..., + voxel_sizes: Vec3dBatchOrScalar = ..., + origins: Vec3dBatch = ..., + mutable: bool = ..., +) -> GridBatch: ... +def sparse_grid_from_nearest_voxels_to_points( + points: JaggedTensorOrTensor, voxel_sizes: Vec3dBatchOrScalar = ..., origins: Vec3dBatch = ..., mutable: bool = ... +) -> GridBatch: ... 
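The module-level sparse_grid_from_* helpers declared here mirror the GridBatch.set_from_* methods above. A brief hedged sketch of the point-based constructor together with the dual/coarsen/subdivide calls declared earlier in this stub (voxel size and point count are illustrative only):

import torch
import fvdb

pts = torch.rand(10_000, 3, device="cuda")
grid = fvdb.sparse_grid_from_points(pts, voxel_sizes=0.01, origins=[0.0, 0.0, 0.0])

dual = grid.dual_grid()           # grid whose voxels sit on the primal voxel corners
coarse = grid.coarsened_grid(2)   # 2x coarser topology
fine = grid.subdivided_grid(2)    # 2x finer topology
print(grid.total_voxels, dual.total_voxels, coarse.total_voxels, fine.total_voxels)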
+def sparse_grid_from_points( + points: JaggedTensorOrTensor, + pad_min: Vec3i = ..., + pad_max: Vec3i = ..., + voxel_sizes: Vec3dBatchOrScalar = ..., + origins: Vec3dBatch = ..., + mutable: bool = ..., +) -> GridBatch: ... +def sparse_grid_from_dense( + num_grids: int, + dense_dims: Vec3i, + ijk_min: Vec3i = ..., + voxel_sizes: Vec3dBatchOrScalar = ..., + origins: Vec3dBatch = ..., + device: TorchDeviceOrString = ..., + mutable: bool = ..., +) -> GridBatch: ... +def sparse_grid_from_mesh( + vertices: JaggedTensorOrTensor, + faces: JaggedTensorOrTensor, + voxel_sizes: Vec3dBatchOrScalar = ..., + origins: Vec3dBatch = ..., + device: TorchDeviceOrString = ..., + mutable: bool = ..., +) -> GridBatch: ... +def volume_render( + sigmas: torch.Tensor, + rgbs: torch.Tensor, + deltaTs: torch.Tensor, + ts: torch.Tensor, + packInfo: torch.Tensor, + transmittanceThresh: float, +) -> List[torch.Tensor]: ... +def load( + path: str, grid_id: Optional[GridIdentifier] = None, device: TorchDeviceOrString = "cpu", verbose: bool = False +) -> Tuple[GridBatch, JaggedTensor, list[str]]: ... +def save( + path: str, + grid: GridBatch, + data: Optional[JaggedTensorOrTensor] = None, + names: Optional[Union[str, List[str]]] = None, + compressed: bool = False, + verbose: bool = False, +): ... +def jrand( + lsizes: LShapeSpec, + rsizes: RShapeSpec | None = None, + dtype: torch.dtype | None = None, + device: TorchDeviceOrString | None = None, + requires_grad: bool = False, + pin_memory: bool = False, +) -> JaggedTensor: ... +def jrandn( + lsizes: LShapeSpec, + rsizes: RShapeSpec | None = None, + dtype: torch.dtype | None = None, + device: TorchDeviceOrString | None = None, + requires_grad: bool = False, + pin_memory: bool = False, +) -> JaggedTensor: ... +def jones( + lsizes: LShapeSpec, + rsizes: RShapeSpec | None = None, + dtype: torch.dtype | None = None, + device: TorchDeviceOrString | None = None, + requires_grad: bool = False, + pin_memory: bool = False, +) -> JaggedTensor: ... +def jzeros( + lsizes: LShapeSpec, + rsizes: RShapeSpec | None = None, + dtype: torch.dtype | None = None, + device: TorchDeviceOrString | None = None, + requires_grad: bool = False, + pin_memory: bool = False, +) -> JaggedTensor: ... +def jempty( + lsizes: LShapeSpec, + rsizes: RShapeSpec | None = None, + dtype: torch.dtype | None = None, + device: TorchDeviceOrString | None = None, + requires_grad: bool = False, + pin_memory: bool = False, +) -> JaggedTensor: ... diff --git a/fvdb/fvdb/__init__.py b/fvdb/fvdb/__init__.py index 0f84cbb3d9..10c1961447 100644 --- a/fvdb/fvdb/__init__.py +++ b/fvdb/fvdb/__init__.py @@ -11,18 +11,30 @@ from . import utils from ._Cpp import JaggedTensor, GridBatch, SparseConvPackInfo, ConvPackBackend -from ._Cpp import (jcat, sparse_grid_from_ijk, sparse_grid_from_points, - sparse_grid_from_nearest_voxels_to_points, - sparse_grid_from_dense, sparse_grid_from_mesh, load, save, - scaled_dot_product_attention, config, - jrand, jrandn, jones, jzeros, jempty) +from ._Cpp import ( + jcat, + sparse_grid_from_ijk, + sparse_grid_from_points, + sparse_grid_from_nearest_voxels_to_points, + sparse_grid_from_dense, + sparse_grid_from_mesh, + load, + save, + scaled_dot_product_attention, + config, + jrand, + jrandn, + jones, + jzeros, + jempty, +) # The following import needs to come after the GridBatch and JaggedTensor imports # immediately above in order to avoid a circular dependency error. from . 
import nn -__version__ = '0.0.1' +__version__ = "0.0.1" __version_info__ = (0, 0, 1) __all__ = [ diff --git a/fvdb/fvdb/nn/modules.py b/fvdb/fvdb/nn/modules.py index 830be512ec..7917bde6ac 100644 --- a/fvdb/fvdb/nn/modules.py +++ b/fvdb/fvdb/nn/modules.py @@ -16,9 +16,11 @@ def fvnn_module(module): # Register class as a module in fvdb.nn old_forward = module.forward + def _forward(self, *args, **kwargs): with record_function(repr(self)): return old_forward(self, *args, **kwargs) + module.forward = _forward return module @@ -26,6 +28,7 @@ def _forward(self, *args, **kwargs): GridOrVDBTensor = Union[fvdb.GridBatch, VDBTensor] ListOrInt = Union[int, List[int]] + @fvnn_module class MaxPool(nn.Module): r"""Applies a 3D max pooling over an input signal. @@ -45,8 +48,7 @@ def __init__(self, kernel_size: ListOrInt, stride: Optional[ListOrInt] = None): self.kernel_size = kernel_size self.stride = stride or self.kernel_size - def forward(self, input: VDBTensor, - ref_coarse_data: Optional[GridOrVDBTensor] = None) -> VDBTensor: + def forward(self, input: VDBTensor, ref_coarse_data: Optional[GridOrVDBTensor] = None) -> VDBTensor: if isinstance(ref_coarse_data, VDBTensor): coarse_grid, coarse_kmap = ref_coarse_data.grid, ref_coarse_data.kmap elif isinstance(ref_coarse_data, fvdb.GridBatch): @@ -55,16 +57,13 @@ def forward(self, input: VDBTensor, coarse_grid, coarse_kmap = None, None new_feature, new_grid = input.grid.max_pool( - self.kernel_size, input.feature, stride=self.stride, - coarse_grid=coarse_grid + self.kernel_size, input.feature, stride=self.stride, coarse_grid=coarse_grid ) new_feature.jdata[torch.isinf(new_feature.jdata)] = 0.0 return VDBTensor(new_grid, new_feature, kmap=coarse_kmap) def extra_repr(self) -> str: - return "kernel_size={kernel_size}, stride={stride}".format( - kernel_size=self.kernel_size, stride=self.stride - ) + return "kernel_size={kernel_size}, stride={stride}".format(kernel_size=self.kernel_size, stride=self.stride) @fvnn_module @@ -76,13 +75,13 @@ class AvgPool(nn.Module): stride: the stride of the window. 
Default value is :attr:`kernel_size` """ + def __init__(self, kernel_size: ListOrInt, stride: Optional[ListOrInt] = None): super().__init__() self.kernel_size = kernel_size self.stride = stride or self.kernel_size - def forward(self, input: VDBTensor, - ref_coarse_data: Optional[GridOrVDBTensor] = None) -> VDBTensor: + def forward(self, input: VDBTensor, ref_coarse_data: Optional[GridOrVDBTensor] = None) -> VDBTensor: if isinstance(ref_coarse_data, VDBTensor): coarse_grid, coarse_kmap = ref_coarse_data.grid, ref_coarse_data.kmap elif isinstance(ref_coarse_data, fvdb.GridBatch): @@ -91,15 +90,12 @@ def forward(self, input: VDBTensor, coarse_grid, coarse_kmap = None, None new_feature, new_grid = input.grid.avg_pool( - self.kernel_size, input.feature, stride=self.stride, - coarse_grid=coarse_grid + self.kernel_size, input.feature, stride=self.stride, coarse_grid=coarse_grid ) return VDBTensor(new_grid, new_feature, kmap=coarse_kmap) def extra_repr(self) -> str: - return "kernel_size={kernel_size}, stride={stride}".format( - kernel_size=self.kernel_size, stride=self.stride - ) + return "kernel_size={kernel_size}, stride={stride}".format(kernel_size=self.kernel_size, stride=self.stride) @fvnn_module @@ -109,13 +105,13 @@ class UpsamplingNearest(nn.Module): Args: scale_factor: the upsampling factor """ + def __init__(self, scale_factor: ListOrInt): super().__init__() self.scale_factor = scale_factor def forward( - self, input: VDBTensor, mask: Optional[JaggedTensor] = None, - ref_fine_data: Optional[GridOrVDBTensor] = None + self, input: VDBTensor, mask: Optional[JaggedTensor] = None, ref_fine_data: Optional[GridOrVDBTensor] = None ) -> VDBTensor: if isinstance(ref_fine_data, VDBTensor): fine_grid, fine_kmap = ref_fine_data.grid, ref_fine_data.kmap @@ -124,9 +120,7 @@ def forward( else: fine_grid, fine_kmap = None, None - new_feature, new_grid = input.grid.subdivide( - self.scale_factor, input.feature, mask, fine_grid=fine_grid - ) + new_feature, new_grid = input.grid.subdivide(self.scale_factor, input.feature, mask, fine_grid=fine_grid) return VDBTensor(new_grid, new_feature, kmap=fine_kmap) def extra_repr(self) -> str: @@ -141,6 +135,7 @@ class FillToGrid(nn.Module): Args: default_value: the default value to fill in the new grid. 
""" + def __init__(self, default_value: float = 0.0) -> None: super().__init__() self.default_value = default_value @@ -172,9 +167,21 @@ class SparseConv3d(nn.Module): """ CUTLASS_SUPPORTED_CHANNELS = [ - (32, 64), (64, 128), (128, 256), (32, 32), (64, 64), (128, 128), - (256, 256), (128, 64), (64, 32), (256, 128), (384, 256), (192, 128), - (256, 512), (512, 256), (512, 512) + (32, 64), + (64, 128), + (128, 256), + (32, 32), + (64, 64), + (128, 128), + (256, 256), + (128, 64), + (64, 32), + (256, 128), + (384, 256), + (192, 128), + (256, 512), + (512, 256), + (512, 512), ] """ @@ -200,7 +207,7 @@ def __init__( kernel_size: Union[int, Sequence] = 3, stride: Union[int, Sequence] = 1, bias: bool = True, - transposed: bool = False + transposed: bool = False, ) -> None: super().__init__() @@ -251,10 +258,7 @@ def extra_repr(self) -> str: return s.format(**self.__dict__) def reset_parameters(self) -> None: - std = 1 / math.sqrt( - (self.out_channels if self.transposed else self.in_channels) - * self.kernel_volume - ) + std = 1 / math.sqrt((self.out_channels if self.transposed else self.in_channels) * self.kernel_volume) self.weight.data.uniform_(-std, std) if self.bias is not None: self.bias.data.uniform_(-std, std) @@ -263,9 +267,12 @@ def _dispatch_conv(self, in_feature, in_grid, in_kmap, out_grid): backend = self.backend - if backend == "cutlass" and ((not self.weight.is_cuda) or - (self.in_channels, self.out_channels) not in self.CUTLASS_SUPPORTED_CHANNELS): - print(f"Cutlass backend does not support {self.in_channels} -> {self.out_channels} convolutions, falling back to default") + if backend == "cutlass" and ( + (not self.weight.is_cuda) or (self.in_channels, self.out_channels) not in self.CUTLASS_SUPPORTED_CHANNELS + ): + print( + f"Cutlass backend does not support {self.in_channels} -> {self.out_channels} convolutions, falling back to default" + ) backend = "default" if backend == "lggs" and ((self.in_channels, self.out_channels) not in [(128, 128)]): @@ -287,9 +294,7 @@ def _dispatch_conv(self, in_feature, in_grid, in_kmap, out_grid): min_coord = in_grid.ijk.jdata.min(axis=0).values # BWHDC -> BCDHW dense_feature = in_grid.read_into_dense(in_feature, min_coord=min_coord).permute(0, 4, 3, 2, 1) - dense_feature = torch.nn.functional.conv3d( - dense_feature, self.weight, padding=1, stride=1 - ) + dense_feature = torch.nn.functional.conv3d(dense_feature, self.weight, padding=1, stride=1) # BCDHW -> BWHDC dense_feature = dense_feature.permute(0, 4, 3, 2, 1).contiguous() dense_feature = in_grid.read_from_dense(dense_feature, dense_origins=min_coord) @@ -305,13 +310,9 @@ def _dispatch_conv(self, in_feature, in_grid, in_kmap, out_grid): else: if self.transposed: assert out_grid is not None - kmap, _ = out_grid.sparse_conv_kernel_map( - self.kernel_size, self.stride, in_grid - ) + kmap, _ = out_grid.sparse_conv_kernel_map(self.kernel_size, self.stride, in_grid) else: - kmap, out_grid = in_grid.sparse_conv_kernel_map( - self.kernel_size, self.stride, out_grid - ) + kmap, out_grid = in_grid.sparse_conv_kernel_map(self.kernel_size, self.stride, out_grid) out_kmap = kmap if can_cache else None @@ -335,17 +336,20 @@ def _build_kmap_and_convert_backend(self, kmap: fvdb.SparseConvPackInfo, backend elif backend == "igemm_mode0": kmap.build_implicit_gemm( - sorted=False, split_mask_num=1, training=self.training, split_mask_num_bwd=3, use_tf32=self.allow_tf32) + sorted=False, split_mask_num=1, training=self.training, split_mask_num_bwd=3, use_tf32=self.allow_tf32 + ) return fvdb.ConvPackBackend.IGEMM elif 
backend == "igemm_mode1": kmap.build_implicit_gemm( - sorted=True, split_mask_num=1, training=self.training, split_mask_num_bwd=3, use_tf32=self.allow_tf32) + sorted=True, split_mask_num=1, training=self.training, split_mask_num_bwd=3, use_tf32=self.allow_tf32 + ) return fvdb.ConvPackBackend.IGEMM elif backend == "igemm_mode2": kmap.build_implicit_gemm( - sorted=True, split_mask_num=3, training=self.training, split_mask_num_bwd=3, use_tf32=self.allow_tf32) + sorted=True, split_mask_num=3, training=self.training, split_mask_num_bwd=3, use_tf32=self.allow_tf32 + ) return fvdb.ConvPackBackend.IGEMM elif backend == "lggs": @@ -368,9 +372,7 @@ def forward( out_grid, out_kmap = in_grid, in_kmap else: - out_grid, out_feature, out_kmap = self._dispatch_conv( - in_feature, in_grid, in_kmap, out_grid - ) + out_grid, out_feature, out_kmap = self._dispatch_conv(in_feature, in_grid, in_kmap, out_grid) if self.bias is not None: out_feature.jdata = out_feature.jdata + self.bias @@ -383,6 +385,7 @@ class GroupNorm(nn.GroupNorm): r"""Applies Group Normalization over a VDBTensor. See :class:`~torch.nn.GroupNorm` for detailed information. """ + def forward(self, input: VDBTensor) -> VDBTensor: num_channels = input.feature.jdata.size(1) assert num_channels == self.num_channels, "Input feature should have the same number of channels as GroupNorm" @@ -393,13 +396,13 @@ def forward(self, input: VDBTensor) -> VDBTensor: result_data = torch.empty_like(flat_data) for b in range(num_batches): - feat = flat_data[flat_offsets[b]:flat_offsets[b+1]] + feat = flat_data[flat_offsets[b] : flat_offsets[b + 1]] if feat.size(0) != 0: feat = feat.transpose(0, 1).reshape(1, num_channels, -1) feat = super().forward(feat) feat = feat.reshape(num_channels, -1).transpose(0, 1) - result_data[flat_offsets[b]:flat_offsets[b+1]] = feat + result_data[flat_offsets[b] : flat_offsets[b + 1]] = feat return VDBTensor(input.grid, input.grid.jagged_like(result_data), input.kmap) @@ -409,6 +412,7 @@ class BatchNorm(nn.BatchNorm1d): r"""Applies Batch Normalization over a VDBTensor. See :class:`~torch.nn.BatchNorm1d` for detailed information. 
""" + def forward(self, input: VDBTensor) -> VDBTensor: num_channels = input.feature.jdata.size(1) assert num_channels == self.num_features, "Input feature should have the same number of channels as BatchNorm" @@ -420,7 +424,7 @@ def forward(self, input: VDBTensor) -> VDBTensor: class ElementwiseMixin: def forward(self, input: VDBTensor) -> VDBTensor: assert isinstance(input, VDBTensor), "Input should have type VDBTensor" - res = super().forward(input.feature.jdata) # type: ignore + res = super().forward(input.feature.jdata) # type: ignore return VDBTensor(input.grid, input.feature.jagged_like(res), input.kmap) diff --git a/fvdb/fvdb/nn/vdbtensor.py b/fvdb/fvdb/nn/vdbtensor.py index 4c0fd42ecf..9352a069ee 100644 --- a/fvdb/fvdb/nn/vdbtensor.py +++ b/fvdb/fvdb/nn/vdbtensor.py @@ -37,9 +37,11 @@ def __post_init__(self): if self.grid.total_voxels != self.feature.jdata.size(0): raise ValueError("grid and feature should have the same total voxel count") if self.kmap is not None: - if not (self.same_grid(self.kmap.source_grid, self.grid) and - self.same_grid(self.kmap.target_grid, self.grid) and - self.kmap.stride == (1, 1, 1)): + if not ( + self.same_grid(self.kmap.source_grid, self.grid) + and self.same_grid(self.kmap.target_grid, self.grid) + and self.kmap.stride == (1, 1, 1) + ): raise ValueError("kmap should operate on the same grid as this tensor") @staticmethod @@ -50,10 +52,10 @@ def type(self, arg0: torch.dtype): return VDBTensor(self.grid, self.feature.type(arg0)) def cpu(self): - return VDBTensor(self.grid.to('cpu'), self.feature.cpu()) + return VDBTensor(self.grid.to("cpu"), self.feature.cpu()) def cuda(self): - return VDBTensor(self.grid.to('cuda'), self.feature.cuda()) + return VDBTensor(self.grid.to("cuda"), self.feature.cuda()) def to(self, device: Any): return VDBTensor(self.grid.to(device), self.feature.to(device)) @@ -107,8 +109,8 @@ def cat(tensors: List[Union["VDBTensor", JaggedTensor, torch.Tensor]], dim: int assert len(tensors) > 0, "At least one tensor should be provided" if dim == 0: assert all(isinstance(t, VDBTensor) for t in tensors), "All tensors should be of type VDBTensor" - new_grid = fvdb.jcat([t.grid for t in tensors]) # type: ignore - new_feature = new_grid.jagged_like(torch.cat([t.feature.jdata for t in tensors])) # type: ignore + new_grid = fvdb.jcat([t.grid for t in tensors]) # type: ignore + new_feature = new_grid.jagged_like(torch.cat([t.feature.jdata for t in tensors])) # type: ignore return VDBTensor(new_grid, new_feature) else: return VDBTensor._feature_ops(lambda *t: torch.cat(t, dim=dim), tensors) @@ -122,8 +124,12 @@ def from_dense(dense_feature: torch.Tensor, ijk_min=None, origins=None, voxel_si if ijk_min is None: ijk_min = [0, 0, 0] grid = fvdb.sparse_grid_from_dense( - dense_feature.size(0), dense_feature.size()[1:4], ijk_min=ijk_min, - voxel_sizes=voxel_sizes, origins=origins, device=dense_feature.device + dense_feature.size(0), + dense_feature.size()[1:4], + ijk_min=ijk_min, + voxel_sizes=voxel_sizes, + origins=origins, + device=dense_feature.device, ) # Note: this would map dense_feature[0, 0, 0] to grid[ijk_min] feature = grid.read_from_dense(dense_feature.contiguous(), dense_origins=ijk_min) diff --git a/fvdb/fvdb/utils/__init__.py b/fvdb/fvdb/utils/__init__.py index 7a7f44504a..7197162af3 100644 --- a/fvdb/fvdb/utils/__init__.py +++ b/fvdb/fvdb/utils/__init__.py @@ -1,4 +1,4 @@ # Copyright Contributors to the OpenVDB Project # SPDX-License-Identifier: MPL-2.0 # -from .._Cpp import volume_render \ No newline at end of file +from .._Cpp 
import volume_render diff --git a/fvdb/fvdb/utils/build_ext.py b/fvdb/fvdb/utils/build_ext.py index 70421eab54..44c5dc506c 100644 --- a/fvdb/fvdb/utils/build_ext.py +++ b/fvdb/fvdb/utils/build_ext.py @@ -30,29 +30,29 @@ def FVDBExtension(name, sources, *args, **kwargs): :return: A :class:`torch.utils.cpp_extension.CppExtension` object. """ - libraries = kwargs.get('libraries', []) - libraries.append('fvdb') - kwargs['libraries'] = libraries + libraries = kwargs.get("libraries", []) + libraries.append("fvdb") + kwargs["libraries"] = libraries - library_dirs = kwargs.get('library_dirs', []) + library_dirs = kwargs.get("library_dirs", []) library_dirs.append(os.path.dirname(fvdb.__file__)) - kwargs['library_dirs'] = library_dirs + kwargs["library_dirs"] = library_dirs - include_dirs = kwargs.get('include_dirs', []) - include_dirs.append(os.path.join(os.path.dirname(fvdb.__file__), 'include')) + include_dirs = kwargs.get("include_dirs", []) + include_dirs.append(os.path.join(os.path.dirname(fvdb.__file__), "include")) # We also need to add this because fvdb internally will refer to their headers without the fvdb/ prefix. - include_dirs.append(os.path.join(os.path.dirname(fvdb.__file__), 'include/fvdb')) - kwargs['include_dirs'] = include_dirs - - extra_link_args = kwargs.get('extra_link_args', []) - extra_link_args.append(f'-Wl,-rpath={os.path.dirname(fvdb.__file__)}') - kwargs['extra_link_args'] = extra_link_args - - extra_compile_args = kwargs.get('extra_compile_args', {}) - extra_compile_args['nvcc'] = extra_compile_args.get('nvcc', []) - if '--extended-lambda' not in extra_compile_args['nvcc']: - extra_compile_args['nvcc'].append('--extended-lambda') - kwargs['extra_compile_args'] = extra_compile_args + include_dirs.append(os.path.join(os.path.dirname(fvdb.__file__), "include/fvdb")) + kwargs["include_dirs"] = include_dirs + + extra_link_args = kwargs.get("extra_link_args", []) + extra_link_args.append(f"-Wl,-rpath={os.path.dirname(fvdb.__file__)}") + kwargs["extra_link_args"] = extra_link_args + + extra_compile_args = kwargs.get("extra_compile_args", {}) + extra_compile_args["nvcc"] = extra_compile_args.get("nvcc", []) + if "--extended-lambda" not in extra_compile_args["nvcc"]: + extra_compile_args["nvcc"].append("--extended-lambda") + kwargs["extra_compile_args"] = extra_compile_args return cpp_extension.CUDAExtension(name, sources, *args, **kwargs) diff --git a/fvdb/scripts/rename_wheels.py b/fvdb/scripts/rename_wheels.py index d868324a23..c0d2f9da46 100644 --- a/fvdb/scripts/rename_wheels.py +++ b/fvdb/scripts/rename_wheels.py @@ -15,11 +15,7 @@ wheel = os.path.basename(wheel) filename, ext = os.path.splitext(wheel) tags = filename.split("-") - new_filename = "-".join( - tags[:-4] - + [tags[-4] + "+" + "torch" + torch_version + "+" + cuda_version] - + tags[-3:] - ) + new_filename = "-".join(tags[:-4] + [tags[-4] + "+" + "torch" + torch_version + "+" + cuda_version] + tags[-3:]) new_filename += ext print(f"Renaming {wheel} -> {new_filename}") os.rename(os.path.join("dist", wheel), os.path.join("dist", new_filename)) diff --git a/fvdb/setup.py b/fvdb/setup.py index a82b8c71c0..6de3f20d4b 100644 --- a/fvdb/setup.py +++ b/fvdb/setup.py @@ -3,25 +3,32 @@ # import os import re -import subprocess import shutil -import requests -from tqdm import tqdm +import subprocess +import tarfile from pathlib import Path import git import git.repo -from git.exc import InvalidGitRepositoryError, GitCommandError -import tarfile +import requests +from git.exc import GitCommandError, InvalidGitRepositoryError 
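The FVDBExtension helper above returns a torch extension object preconfigured with the fvdb include paths, library directory and rpath. A hedged sketch of a downstream setup.py that might use it; the package name and source file are hypothetical, only FVDBExtension and the torch build command come from this patch:

from setuptools import setup
from torch.utils import cpp_extension

from fvdb.utils.build_ext import FVDBExtension

setup(
    name="my_fvdb_op",  # hypothetical downstream package
    ext_modules=[FVDBExtension("my_fvdb_op._C", sources=["src/my_op.cu"])],
    cmdclass={"build_ext": cpp_extension.BuildExtension},
)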
from setuptools import setup from torch.utils import cpp_extension +from tqdm import tqdm + +is_conda_env = "CONDA_PREFIX" in os.environ +if is_conda_env: + os.environ["CXX"] = "x86_64-conda-linux-gnu-g++" + os.environ["NVCC_CCBIN"] = "x86_64-conda-linux-gnu-gcc" + def get_nanovdb_source_dir(): - nanovdb_source_dir = '../nanovdb' + nanovdb_source_dir = "../nanovdb" if not os.path.exists(nanovdb_source_dir): - nanovdb_source_dir = 'external/openvdb/nanovdb' + nanovdb_source_dir = "external/openvdb/nanovdb" return nanovdb_source_dir + class FVDBBuildCommand(cpp_extension.BuildExtension): @staticmethod @@ -38,7 +45,7 @@ def is_git_repo(repo_path: str): @staticmethod def download_external_dep(name: str, git_url: str, git_tag: str, recursive: bool = False): based = os.path.dirname(os.path.abspath(__file__)) - external_path = os.path.join(based, 'external') + external_path = os.path.join(based, "external") if not os.path.exists(external_path): os.makedirs(external_path, exist_ok=True) elif not os.path.isdir(external_path): @@ -53,7 +60,7 @@ def download_external_dep(name: str, git_url: str, git_tag: str, recursive: bool raise ValueError(f"A path {repo_path} exists but is not a git repo") else: if recursive: - repo = git.repo.Repo.clone_from(git_url, repo_path, multi_options=['--recursive']) + repo = git.repo.Repo.clone_from(git_url, repo_path, multi_options=["--recursive"]) else: repo = git.repo.Repo.clone_from(git_url, repo_path) repo.git.checkout(git_tag) @@ -62,89 +69,87 @@ def download_external_dep(name: str, git_url: str, git_tag: str, recursive: bool @staticmethod def build_cmake_project(base_path, cmake_args): - cmake_build_dir = os.path.join(base_path, 'build') - cmake_install_dir = os.path.join(base_path, 'install') + cmake_build_dir = os.path.join(base_path, "build") + cmake_install_dir = os.path.join(base_path, "install") os.makedirs(cmake_build_dir, exist_ok=True) os.makedirs(cmake_install_dir, exist_ok=True) - subprocess.check_call(['cmake', base_path, f'-DCMAKE_INSTALL_PREFIX={cmake_install_dir}'] + cmake_args, - cwd=cmake_build_dir) - subprocess.check_call(['cmake', '--build', '.', '--target', 'install'], - cwd=cmake_build_dir) + subprocess.check_call( + ["cmake", base_path, f"-DCMAKE_INSTALL_PREFIX={cmake_install_dir}"] + cmake_args, cwd=cmake_build_dir + ) + subprocess.check_call(["cmake", "--build", ".", "--target", "install"], cwd=cmake_build_dir) return cmake_install_dir def build_extension(self, _ext): - path = os.path.join(self.build_lib, 'fvdb') + path = os.path.join(self.build_lib, "fvdb") - if _ext.name == 'fvdb._Cpp': + if _ext.name == "fvdb._Cpp": _ext.library_dirs.append(path) super().build_extension(_ext) - if _ext.name == 'fvdb.fvdblib': - if os.path.exists(os.path.join(path, 'libfvdb.so')): - os.remove(os.path.join(path, 'libfvdb.so')) + if _ext.name == "fvdb.fvdblib": + if os.path.exists(os.path.join(path, "libfvdb.so")): + os.remove(os.path.join(path, "libfvdb.so")) # Find the .so file in the fvdb subdirectory of self.build_lib # assert that there is only a single one. 
- so_files = [os.path.join(path, t) for t in os.listdir(path) if t.endswith('.so') and t.startswith('fvdblib')] + so_files = [ + os.path.join(path, t) for t in os.listdir(path) if t.endswith(".so") and t.startswith("fvdblib") + ] assert len(so_files) == 1 # Copy the file in so_files[0] to lib/libfvdb.so - shutil.copy(so_files[0], os.path.join(path, 'libfvdb.so')) + shutil.copy(so_files[0], os.path.join(path, "libfvdb.so")) # Also copy the file to the appropriate directory if installing inplace if self.old_inplace: - build_py = self.get_finalized_command('build_py') - inplace_file, regular_file = self._get_inplace_equivalent(build_py, _ext) # type: ignore - inplace_file = os.path.join(os.path.dirname(inplace_file), 'libfvdb.so') - regular_file = os.path.join(os.path.dirname(regular_file), 'libfvdb.so') - self.copy_file(regular_file, inplace_file, level=self.verbose) # type: ignore + build_py = self.get_finalized_command("build_py") + inplace_file, regular_file = self._get_inplace_equivalent(build_py, _ext) # type: ignore + inplace_file = os.path.join(os.path.dirname(inplace_file), "libfvdb.so") + regular_file = os.path.join(os.path.dirname(regular_file), "libfvdb.so") + self.copy_file(regular_file, inplace_file, level=self.verbose) # type: ignore def run(self) -> None: # A sibling nanovdb source directory will exist if fvdb is being built as part of OpenVDB - sibling_nanovdb_dir = Path('../nanovdb') + sibling_nanovdb_dir = Path("../nanovdb") if not sibling_nanovdb_dir.exists(): openvdb_url = "https://github.com/kmuseth/openvdb.git" - self.download_external_dep( - name='openvdb', - git_url=openvdb_url, - git_tag='feature/nanovdb_v32.7' - ) + self.download_external_dep(name="openvdb", git_url=openvdb_url, git_tag="feature/nanovdb_v32.7") _, cutlass_repo = self.download_external_dep( - name='cutlass', - git_url='https://github.com/NVIDIA/cutlass.git', - git_tag='v3.4.0' + name="cutlass", git_url="https://github.com/NVIDIA/cutlass.git", git_tag="v3.4.0" ) try: # NOTE: In python <=3.8, __file__ will be a relative path and >3.8 it is an absolute path - cutlass_repo.git.apply(Path(__file__).resolve().parent / 'env' / 'cutlass.patch') + cutlass_repo.git.apply(Path(__file__).resolve().parent / "env" / "cutlass.patch") except GitCommandError as e: print(f"Failed to apply cutlass patch: {str(e)}, continuing without patching") self.download_external_dep( - name='cudnn_fe', - git_url='https://github.com/NVIDIA/cudnn-frontend', - git_tag='v1.3.0' + name="cudnn_fe", git_url="https://github.com/NVIDIA/cudnn-frontend", git_tag="v1.3.0" ) blosc_source_dir, _ = self.download_external_dep( - name='c-blosc', - git_url='https://github.com/Blosc/c-blosc.git', - git_tag='v1.21.4' + name="c-blosc", git_url="https://github.com/Blosc/c-blosc.git", git_tag="v1.21.4" + ) + self.build_cmake_project( + blosc_source_dir, + [ + "-DBUILD_SHARED=OFF", + "-DBUILD_TESTS=OFF", + "-DBUILD_FUZZERS=OFF", + "-DBUILD_BENCHMARKS=OFF", + "-DCMAKE_POSITION_INDEPENDENT_CODE=ON", + ], ) - self.build_cmake_project(blosc_source_dir, [ - "-DBUILD_SHARED=OFF", "-DBUILD_TESTS=OFF", "-DBUILD_FUZZERS=OFF", "-DBUILD_BENCHMARKS=OFF", - "-DCMAKE_POSITION_INDEPENDENT_CODE=ON" - ]) self.old_inplace = self.inplace super().run() # Find all the headers and copy them into the build directory. # This way extension modules of FVDB can include them. 
- fvdb_headers = get_header_files_recursive('src', 'fvdb') - nanovdb_headers = get_header_files_recursive(get_nanovdb_source_dir(), 'nanovdb') + fvdb_headers = get_header_files_recursive("src", "fvdb") + nanovdb_headers = get_header_files_recursive(get_nanovdb_source_dir(), "nanovdb") for header_folder, header_files in fvdb_headers + nanovdb_headers: os.makedirs(os.path.join(self.build_lib, header_folder), exist_ok=True) @@ -156,7 +161,7 @@ def run(self) -> None: def get_source_files_recursive(base_path, include_bindings=True): source_files = [] for dir_name, _, dir_files in os.walk(base_path): - if not include_bindings and os.path.basename(dir_name) == 'python': + if not include_bindings and os.path.basename(dir_name) == "python": continue cpp_files = [os.path.join(dir_name, t) for t in dir_files if t.endswith(".cpp")] cu_files = [os.path.join(dir_name, t) for t in dir_files if t.endswith(".cu")] @@ -165,11 +170,11 @@ def get_source_files_recursive(base_path, include_bindings=True): def get_header_files_recursive(base_path, new_path): - base_len = len(base_path.split('/')) + base_len = len(base_path.split("/")) source_files = [] for dir_name, _, dir_files in os.walk(base_path): header_files = [os.path.join(dir_name, t) for t in dir_files if t.endswith(".h") or t.endswith(".cuh")] - header_folder = [os.path.join('fvdb/include', new_path, *(h.split('/')[base_len:-1])) for h in header_files] + header_folder = [os.path.join("fvdb/include", new_path, *(h.split("/")[base_len:-1])) for h in header_files] # All items of header_folder should be the same if len(header_folder) > 0: @@ -179,8 +184,10 @@ def get_header_files_recursive(base_path, new_path): def download_and_install_cudnn(): - url = "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/" + \ - "cudnn-linux-x86_64-9.1.0.70_cuda12-archive.tar.xz" + url = ( + "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/" + + "cudnn-linux-x86_64-9.1.0.70_cuda12-archive.tar.xz" + ) cwd = os.path.dirname(os.path.abspath(__file__)) tar_filepath = os.path.join(cwd, "external/cudnn.tar.xz") folder_filepath = os.path.join(cwd, "external/cudnn") @@ -224,21 +231,21 @@ def download_and_install_cudnn(): if __name__ == "__main__": - if not os.path.exists('external'): - os.makedirs('external') + if not os.path.exists("external"): + os.makedirs("external") else: - assert os.path.isdir('external'), "external exists but is not a directory" + assert os.path.isdir("external"), "external exists but is not a directory" # Use new C++ standard for newer NVCC versions cuda_home = cpp_extension.CUDA_HOME cuda_version = None if cuda_home is not None: - cuda_version_str = subprocess.check_output([cuda_home + "/bin/nvcc", '--version']).strip().decode() - cuda_version = re.search(r'release (\d+[.]\d+)', cuda_version_str) + cuda_version_str = subprocess.check_output([cuda_home + "/bin/nvcc", "--version"]).strip().decode() + cuda_version = re.search(r"release (\d+[.]\d+)", cuda_version_str) if cuda_version is not None: cuda_version = cuda_version.group(1) - if cuda_version is not None and int(cuda_version.split('.')[0]) >= 12: + if cuda_version is not None and int(cuda_version.split(".")[0]) >= 12: cpp_std = "c++20" cudnn_include_dirs, cudnn_static_libs = download_and_install_cudnn() else: @@ -247,54 +254,66 @@ def download_and_install_cudnn(): cwd = os.path.dirname(os.path.abspath(__file__)) cpp_flags = [ - f'-std={cpp_std}', - '-Wno-unknown-pragmas', - '-Wno-class-memaccess', - '-fdiagnostics-color=always', - 
'-DNANOVDB_USE_BLOSC', + f"-std={cpp_std}", + "-Wno-unknown-pragmas", + "-Wno-class-memaccess", + "-fdiagnostics-color=always", + "-DNANOVDB_USE_BLOSC", + ] + nvcc_flags = [ + f"-std={cpp_std}", + "--extended-lambda", + "--diag-suppress=186", + "-diag-suppress=3189", ] - nvcc_flags = [f'-std={cpp_std}', '--extended-lambda', '--diag-suppress=186', '-diag-suppress=3189'] user_nvcc_flags = os.getenv("NVCC_FLAGS", "").split() nvcc_flags += user_nvcc_flags lib_ext = cpp_extension.CUDAExtension( - name='fvdb.fvdblib', - sources=get_source_files_recursive('src', include_bindings=False), - include_dirs=[os.path.join(cwd, 'src'), - os.path.join(cwd, get_nanovdb_source_dir()), - os.path.join(cwd, 'external/cutlass/include'), - os.path.join(cwd, 'external/c-blosc/install/include'), - os.path.join(cwd, 'external/cudnn_fe/include')] + cudnn_include_dirs, - extra_objects=['external/c-blosc/install/lib/libblosc.a'] + cudnn_static_libs, - extra_compile_args={'cxx': cpp_flags + ['-fvisibility=default'], - 'nvcc': nvcc_flags}, - language='c++') + name="fvdb.fvdblib", + sources=get_source_files_recursive("src", include_bindings=False), + include_dirs=[ + os.path.join(cwd, "src"), + os.path.join(cwd, get_nanovdb_source_dir()), + os.path.join(cwd, "external/cutlass/include"), + os.path.join(cwd, "external/c-blosc/install/include"), + os.path.join(cwd, "external/cudnn_fe/include"), + ] + + cudnn_include_dirs, + extra_objects=["external/c-blosc/install/lib/libblosc.a"] + cudnn_static_libs, + extra_compile_args={"cxx": cpp_flags + ["-fvisibility=default"], "nvcc": nvcc_flags}, + language="c++", + ) bind_ext = cpp_extension.CUDAExtension( - name='fvdb._Cpp', - sources=get_source_files_recursive('src/python/'), - include_dirs=[os.path.join(cwd, 'src'), - os.path.join(cwd, get_nanovdb_source_dir()), - os.path.join(cwd, 'external/cutlass/include'), - os.path.join(cwd, 'external/c-blosc/install/include')], - library_dirs=[os.path.join(cwd, 'fvdb')], - libraries=['fvdb'], - extra_link_args=['-Wl,-rpath,$ORIGIN'], - extra_compile_args={'cxx': cpp_flags + ['-fvisibility=hidden'], - 'nvcc': nvcc_flags}, - language='c++') - - def retrieve_version(file_path = "fvdb/__init__.py"): + name="fvdb._Cpp", + sources=get_source_files_recursive("src/python/"), + include_dirs=[ + os.path.join(cwd, "src"), + os.path.join(cwd, get_nanovdb_source_dir()), + os.path.join(cwd, "external/cutlass/include"), + os.path.join(cwd, "external/c-blosc/install/include"), + ], + library_dirs=[os.path.join(cwd, "fvdb")], + libraries=["fvdb"], + extra_link_args=["-Wl,-rpath,$ORIGIN"], + extra_compile_args={"cxx": cpp_flags + ["-fvisibility=hidden"], "nvcc": nvcc_flags}, + language="c++", + ) + + def retrieve_version(file_path="fvdb/__init__.py"): with open(file_path, "r") as f: for line in f: if line.startswith("__version__"): return line.split("=")[1].strip().strip("'").strip('"') return "0.0.0" - setup(name='fvdb', - version = retrieve_version(), + setup( + name="fvdb", + version=retrieve_version(), ext_modules=[lib_ext, bind_ext], - packages=['fvdb', 'fvdb.nn', 'fvdb.utils'], + packages=["fvdb", "fvdb.nn", "fvdb.utils"], include_package_data=True, - package_data={'fvdb': ['_Cpp.pyi', 'py.typed']}, - cmdclass={'build_ext': FVDBBuildCommand}) + package_data={"fvdb": ["_Cpp.pyi", "py.typed"]}, + cmdclass={"build_ext": FVDBBuildCommand}, + ) diff --git a/fvdb/src/Config.cpp b/fvdb/src/Config.cpp index c0027f498e..8554a909c3 100644 --- a/fvdb/src/Config.cpp +++ b/fvdb/src/Config.cpp @@ -7,23 +7,28 @@ namespace fvdb { Config::Config() = default; -Config& 
Config::global() { +Config & +Config::global() { static Config _config; return _config; } -void Config::setUltraSparseAcceleration(bool enabled) { +void +Config::setUltraSparseAcceleration(bool enabled) { mUltraSparseAcceleration = enabled; } -bool Config::ultraSparseAccelerationEnabled() const { +bool +Config::ultraSparseAccelerationEnabled() const { return mUltraSparseAcceleration; } -void Config::setPendanticErrorChecking(bool enabled) { +void +Config::setPendanticErrorChecking(bool enabled) { mPendanticErrorChecking = enabled; } -bool Config::pendanticErrorCheckingEnabled() const { +bool +Config::pendanticErrorCheckingEnabled() const { return mPendanticErrorChecking; } diff --git a/fvdb/src/Config.h b/fvdb/src/Config.h index 056cd5fc41..6c0677a5d8 100644 --- a/fvdb/src/Config.h +++ b/fvdb/src/Config.h @@ -1,14 +1,13 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once - +#ifndef FVDB_CONFIG_H +#define FVDB_CONFIG_H namespace fvdb { class Config { - -public: + public: Config(); void setUltraSparseAcceleration(bool enabled); @@ -17,11 +16,13 @@ class Config { void setPendanticErrorChecking(bool enabled); bool pendanticErrorCheckingEnabled() const; - static Config& global(); + static Config &global(); -private: + private: bool mUltraSparseAcceleration = false; - bool mPendanticErrorChecking = false; + bool mPendanticErrorChecking = false; }; -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_CONFIG_H \ No newline at end of file diff --git a/fvdb/src/FVDB.cpp b/fvdb/src/FVDB.cpp index c77318ac83..955aea8844 100644 --- a/fvdb/src/FVDB.cpp +++ b/fvdb/src/FVDB.cpp @@ -13,19 +13,18 @@ namespace fvdb { -std::vector volumeRender(const torch::Tensor& sigmas, const torch::Tensor& rgbs, - const torch::Tensor& deltaTs, const torch::Tensor& ts, - const torch::Tensor& jOffsets, double transmittanceThresh) { - return detail::autograd::VolumeRender::apply(sigmas, rgbs, deltaTs, ts, jOffsets, transmittanceThresh); +std::vector +volumeRender(const torch::Tensor &sigmas, const torch::Tensor &rgbs, const torch::Tensor &deltaTs, + const torch::Tensor &ts, const torch::Tensor &jOffsets, double transmittanceThresh) { + return detail::autograd::VolumeRender::apply(sigmas, rgbs, deltaTs, ts, jOffsets, + transmittanceThresh); } -JaggedTensor scaledDotProductAttention(const JaggedTensor& query, - const JaggedTensor& key, - const JaggedTensor& value, - float scale) { - - cudaDeviceProp* p = at::cuda::getDeviceProperties(query.device().index()); - const int computeCapability = p->major * 10 + p->minor; +JaggedTensor +scaledDotProductAttention(const JaggedTensor &query, const JaggedTensor &key, + const JaggedTensor &value, float scale) { + cudaDeviceProp *p = at::cuda::getDeviceProperties(query.device().index()); + const int computeCapability = p->major * 10 + p->minor; if (computeCapability < 90) { // https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html @@ -33,134 +32,126 @@ JaggedTensor scaledDotProductAttention(const JaggedTensor& query, // - key: (N, ..., S, E) // - value: (N, ..., S, V) std::vector outList; - torch::Tensor qOffsets = query.joffsets().cpu(); - torch::Tensor kvOffsets = key.joffsets().cpu(); + torch::Tensor qOffsets = query.joffsets().cpu(); + torch::Tensor kvOffsets = key.joffsets().cpu(); for (int64_t b = 0; b < query.num_tensors(); ++b) { - int64_t qStart = qOffsets[b].item(); - int64_t qEnd = qOffsets[b+1].item(); + int64_t qStart = qOffsets[b].item(); + int64_t 
qEnd = qOffsets[b + 1].item(); int64_t kvStart = kvOffsets[b].item(); - int64_t kvEnd = kvOffsets[b+1].item(); - - torch::Tensor q = query.jdata().index({torch::indexing::Slice(qStart, qEnd)}).permute({1, 0, 2}); - torch::Tensor k = key.jdata().index({torch::indexing::Slice(kvStart, kvEnd)}).permute({1, 0, 2}); - torch::Tensor v = value.jdata().index({torch::indexing::Slice(kvStart, kvEnd)}).permute({1, 0, 2}); - - torch::Tensor out = at::native::scaled_dot_product_attention(q, k, v, {}, 0.0, false, scale); - outList.push_back(out.permute({1, 0, 2})); + int64_t kvEnd = kvOffsets[b + 1].item(); + + torch::Tensor q = + query.jdata().index({ torch::indexing::Slice(qStart, qEnd) }).permute({ 1, 0, 2 }); + torch::Tensor k = + key.jdata().index({ torch::indexing::Slice(kvStart, kvEnd) }).permute({ 1, 0, 2 }); + torch::Tensor v = value.jdata() + .index({ torch::indexing::Slice(kvStart, kvEnd) }) + .permute({ 1, 0, 2 }); + + torch::Tensor out = + at::native::scaled_dot_product_attention(q, k, v, {}, 0.0, false, scale); + outList.push_back(out.permute({ 1, 0, 2 })); } return JaggedTensor(outList); } // Custom implementation with CUDNN is only available for Hopper. - torch::Tensor qLengths = query.joffsets().index({torch::indexing::Slice(1, query.num_tensors())}); - torch::Tensor kvLengths = key.joffsets().index({torch::indexing::Slice(1, query.num_tensors())}); + torch::Tensor qLengths = + query.joffsets().index({ torch::indexing::Slice(1, query.num_tensors()) }); + torch::Tensor kvLengths = + key.joffsets().index({ torch::indexing::Slice(1, query.num_tensors()) }); torch::Tensor res = detail::autograd::Attention::apply( query.jdata(), key.jdata(), value.jdata(), qLengths, kvLengths, scale)[0]; return query.jagged_like(res); } std::tuple> -from_nanovdb(nanovdb::GridHandle& handle){ +from_nanovdb(nanovdb::GridHandle &handle) { return detail::io::fromNVDB(handle); } nanovdb::GridHandle -to_nanovdb(const GridBatch& gridBatch, - const torch::optional maybeData, - const torch::optional maybeNames){ +to_nanovdb(const GridBatch &gridBatch, const torch::optional maybeData, + const torch::optional maybeNames) { return detail::io::toNVDB(gridBatch, maybeData, maybeNames); } - -GridBatch jcat(const std::vector& vec) { - std::vector> vecHdls; - std::transform(vec.begin(), vec.end(), std::back_inserter(vecHdls), - [](const GridBatch& grid) { return grid.impl(); }); - return GridBatch(detail::GridBatchImpl::concatenate(vecHdls)); +GridBatch +jcat(const std::vector &vec) { + std::vector> vecHdls; + std::transform(vec.begin(), vec.end(), std::back_inserter(vecHdls), + [](const GridBatch &grid) { return grid.impl(); }); + return GridBatch(detail::GridBatchImpl::concatenate(vecHdls)); } -JaggedTensor jcat(const std::vector& vec, torch::optional dim) { +JaggedTensor +jcat(const std::vector &vec, torch::optional dim) { return JaggedTensor::jcat(vec, dim); } -void save(const std::string& path, - const GridBatch& gridBatch, - const torch::optional maybeData, - const torch::optional maybeNames, - bool compressed, - bool verbose) { +void +save(const std::string &path, const GridBatch &gridBatch, + const torch::optional maybeData, + const torch::optional maybeNames, bool compressed, bool verbose) { detail::io::saveNVDB(path, gridBatch, maybeData, maybeNames, compressed, verbose); } - std::tuple> -load(const std::string& path, - NanoVDBFileGridIdentifier gridIdentifier, - TorchDeviceOrString device, +load(const std::string &path, NanoVDBFileGridIdentifier gridIdentifier, TorchDeviceOrString device, bool verbose) { return 
detail::io::loadNVDB(path, gridIdentifier, device, verbose); } -GridBatch sparse_grid_from_points(const JaggedTensor& points, - const Vec3i& pad_min, - const Vec3i& pad_max, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins, - bool is_mutable) { +GridBatch +sparse_grid_from_points(const JaggedTensor &points, const Vec3i &pad_min, const Vec3i &pad_max, + const Vec3dBatchOrScalar &voxel_sizes, const Vec3dBatch &origins, + bool is_mutable) { auto ret = GridBatch(points.device(), is_mutable); ret.set_from_points(points, pad_min, pad_max, voxel_sizes, origins); return ret; } - -GridBatch sparse_grid_from_ijk(const JaggedTensor& ijk, - const Vec3i& pad_min, - const Vec3i& pad_max, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins, - bool is_mutable) { +GridBatch +sparse_grid_from_ijk(const JaggedTensor &ijk, const Vec3i &pad_min, const Vec3i &pad_max, + const Vec3dBatchOrScalar &voxel_sizes, const Vec3dBatch &origins, + bool is_mutable) { auto ret = GridBatch(ijk.device(), is_mutable); ret.set_from_ijk(ijk, pad_min, pad_max, voxel_sizes, origins); return ret; } - -GridBatch sparse_grid_from_nearest_voxels_to_points(const JaggedTensor& points, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins, - bool is_mutable) { +GridBatch +sparse_grid_from_nearest_voxels_to_points(const JaggedTensor &points, + const Vec3dBatchOrScalar &voxel_sizes, + const Vec3dBatch &origins, bool is_mutable) { auto ret = GridBatch(points.device(), is_mutable); ret.set_from_nearest_voxels_to_points(points, voxel_sizes, origins); return ret; } - -GridBatch sparse_grid_from_dense(const int64_t numGrids, - const Vec3i& denseDims, - const Vec3i& ijkMin, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins, - torch::optional mask, - TorchDeviceOrString device, bool is_mutable) { +GridBatch +sparse_grid_from_dense(const int64_t numGrids, const Vec3i &denseDims, const Vec3i &ijkMin, + const Vec3dBatchOrScalar &voxel_sizes, const Vec3dBatch &origins, + torch::optional mask, TorchDeviceOrString device, + bool is_mutable) { auto ret = GridBatch(device, is_mutable); ret.set_from_dense_grid(numGrids, denseDims, ijkMin, voxel_sizes, origins, mask); return ret; } -GridBatch sparse_grid_from_mesh(const JaggedTensor& vertices, - const JaggedTensor& faces, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins, - bool is_mutable) { +GridBatch +sparse_grid_from_mesh(const JaggedTensor &vertices, const JaggedTensor &faces, + const Vec3dBatchOrScalar &voxel_sizes, const Vec3dBatch &origins, + bool is_mutable) { auto ret = GridBatch(vertices.device(), is_mutable); ret.set_from_mesh(vertices, faces, voxel_sizes, origins); return ret; } - -std::vector jdataShape1(const std::vector& lsizes, const std::vector& rsizes) { +std::vector +jdataShape1(const std::vector &lsizes, const std::vector &rsizes) { const int64_t totalElements = std::reduce(std::execution::par, lsizes.begin(), lsizes.end()); std::vector shape; shape.reserve(rsizes.size() + 1); @@ -169,17 +160,20 @@ std::vector jdataShape1(const std::vector& lsizes, const std:: return shape; } -std::tuple> jdataShape2(const std::vector>& lsizes, const std::vector& rsizes) { +std::tuple> +jdataShape2(const std::vector> &lsizes, const std::vector &rsizes) { std::vector elementCountsPerList; std::vector tensorCountsPerList; elementCountsPerList.reserve(lsizes.size()); tensorCountsPerList.reserve(lsizes.size()); - for (const auto& l : lsizes) { + for (const auto &l: lsizes) { 
elementCountsPerList.push_back(std::reduce(std::execution::par, l.begin(), l.end())); tensorCountsPerList.push_back(l.size()); } - const int64_t totalSize = std::reduce(std::execution::par, elementCountsPerList.begin(), elementCountsPerList.end()); - const int64_t totalTensors = std::reduce(std::execution::par, tensorCountsPerList.begin(), tensorCountsPerList.end()); + const int64_t totalSize = + std::reduce(std::execution::par, elementCountsPerList.begin(), elementCountsPerList.end()); + const int64_t totalTensors = + std::reduce(std::execution::par, tensorCountsPerList.begin(), tensorCountsPerList.end()); std::vector shape; shape.reserve(rsizes.size() + 1); shape.push_back(totalSize); @@ -188,19 +182,17 @@ std::tuple> jdataShape2(const std::vector& lsizes, \ - const std::vector rsizes, \ - at::TensorOptions options) { \ - auto shape = jdataShape1(lsizes, rsizes); \ - return JaggedTensor(lsizes, FNAME(shape, options)); \ - } \ - \ - JaggedTensor JFNAME(const std::vector>& lsizes, \ - const std::vector rsizes, \ - at::TensorOptions options) { \ - auto shape = jdataShape2(lsizes, rsizes); \ - return JaggedTensor(lsizes, std::get<0>(shape), FNAME(std::get<1>(shape), options)); \ +#define __FVDB__BUILDER(FNAME, JFNAME) \ + JaggedTensor JFNAME(const std::vector &lsizes, const std::vector rsizes, \ + at::TensorOptions options) { \ + auto shape = jdataShape1(lsizes, rsizes); \ + return JaggedTensor(lsizes, FNAME(shape, options)); \ + } \ + \ + JaggedTensor JFNAME(const std::vector> &lsizes, \ + const std::vector rsizes, at::TensorOptions options) { \ + auto shape = jdataShape2(lsizes, rsizes); \ + return JaggedTensor(lsizes, std::get<0>(shape), FNAME(std::get<1>(shape), options)); \ } __FVDB__BUILDER(torch::rand, jrand) diff --git a/fvdb/src/FVDB.h b/fvdb/src/FVDB.h index a0052da2c1..e41c27cc07 100644 --- a/fvdb/src/FVDB.h +++ b/fvdb/src/FVDB.h @@ -1,37 +1,37 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_FVDB_H +#define FVDB_FVDB_H -#include - -#include "JaggedTensor.h" #include "GridBatch.h" +#include "JaggedTensor.h" #include "SparseConvPackInfo.h" #include "Types.h" +#include + namespace fvdb { -std::vector volumeRender(const torch::Tensor& sigmas, const torch::Tensor& rgbs, - const torch::Tensor& deltaTs, const torch::Tensor& ts, - const torch::Tensor& packInfo, double transmittanceThresh); +std::vector volumeRender(const torch::Tensor &sigmas, const torch::Tensor &rgbs, + const torch::Tensor &deltaTs, const torch::Tensor &ts, + const torch::Tensor &packInfo, double transmittanceThresh); -JaggedTensor scaledDotProductAttention(const JaggedTensor& query, - const JaggedTensor& key, - const JaggedTensor& value, - float scale); +JaggedTensor scaledDotProductAttention(const JaggedTensor &query, const JaggedTensor &key, + const JaggedTensor &value, float scale); /// @brief Concatenate a list of grid batches into a single grid batch /// @param vec A list of grid batches to concatenate /// @return A GridBatch representing the concatenated grid batch -GridBatch jcat(const std::vector& vec); +GridBatch jcat(const std::vector &vec); /// @brief Concatenate a list of JaggedTensor into a single JaggedTensor /// @param vec A list of JaggedTensor to concatenate -/// @param dim The dimension to concatenate along or nullptr to concatenate the outermost tensor lists +/// @param dim The dimension to concatenate along or nullptr to concatenate the outermost tensor +/// lists /// @return A JaggedTensor representing the concatenated 
JaggedTensor -JaggedTensor jcat(const std::vector& vec, torch::optional dim = torch::nullopt); - +JaggedTensor jcat(const std::vector &vec, + torch::optional dim = torch::nullopt); /// @brief Create a JaggedTensor filled with random numbers from a uniform distribution /// on the interval [0, 1) with the specified lshape an rshape @@ -39,12 +39,10 @@ JaggedTensor jcat(const std::vector& vec, torch::optional /// @param rsizes The rshape of the JaggedTensor (feature dimension of each tensor) /// @param options The options to use for the created tensor /// @return A JaggedTensor filled with random numbers from the uniform distribution on [0, 1). -JaggedTensor jrand(const std::vector& lsizes, - const std::vector rsizes = {}, - at::TensorOptions options = {}); -JaggedTensor jrand(const std::vector>& lsizes, - const std::vector rsizes = {}, +JaggedTensor jrand(const std::vector &lsizes, const std::vector rsizes = {}, at::TensorOptions options = {}); +JaggedTensor jrand(const std::vector> &lsizes, + const std::vector rsizes = {}, at::TensorOptions options = {}); /// @brief Create a JaggedTensor filled with random numbers from a normal distribution /// with mean 0 and variance 1 (also called the standard normal distribution). @@ -52,183 +50,192 @@ JaggedTensor jrand(const std::vector>& lsizes, /// @param rsizes The rshape of the JaggedTensor (feature dimension of each tensor) /// @param options The options to use for the created tensor /// @return A JaggedTensor filled with random numbers from the standard normal distribution. -JaggedTensor jrandn(const std::vector& lsizes, - const std::vector rsizes = {}, - at::TensorOptions options = {}); -JaggedTensor jrandn(const std::vector>& lsizes, - const std::vector rsizes = {}, +JaggedTensor jrandn(const std::vector &lsizes, const std::vector rsizes = {}, at::TensorOptions options = {}); +JaggedTensor jrandn(const std::vector> &lsizes, + const std::vector rsizes = {}, at::TensorOptions options = {}); /// @brief Create a JaggedTensor filled with zeros. /// @param lsizes The lshape of the JaggedTensor (number of elements per tensor) /// @param rsizes The rshape of the JaggedTensor (feature dimension of each tensor) /// @param options The options to use for the created tensor /// @return A JaggedTensor filled with zeros. -JaggedTensor jzeros(const std::vector& lsizes, - const std::vector rsizes = {}, - at::TensorOptions options = {}); -JaggedTensor jzeros(const std::vector>& lsizes, - const std::vector rsizes = {}, +JaggedTensor jzeros(const std::vector &lsizes, const std::vector rsizes = {}, at::TensorOptions options = {}); +JaggedTensor jzeros(const std::vector> &lsizes, + const std::vector rsizes = {}, at::TensorOptions options = {}); /// @brief Create a JaggedTensor filled with ones. /// @param lsizes The lshape of the JaggedTensor (number of elements per tensor) /// @param rsizes The rshape of the JaggedTensor (feature dimension of each tensor) /// @param options The options to use for the created tensor /// @return A JaggedTensor filled with ones. -JaggedTensor jones(const std::vector& lsizes, - const std::vector rsizes = {}, - at::TensorOptions options = {}); -JaggedTensor jones(const std::vector>& lsizes, - const std::vector rsizes = {}, +JaggedTensor jones(const std::vector &lsizes, const std::vector rsizes = {}, at::TensorOptions options = {}); +JaggedTensor jones(const std::vector> &lsizes, + const std::vector rsizes = {}, at::TensorOptions options = {}); /// @brief Create an empty JaggedTensor with uninitialized values. 
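A short usage sketch for the builder family declared above (jrand, jrandn, jzeros, jones, and jempty documented next), based only on the signatures in this header; the lshape/rshape values and the use of default tensor options are illustrative assumptions.

    // Sketch only: constructs jagged tensors with the factory functions declared in
    // FVDB.h and concatenates them with jcat.
    #include <fvdb/FVDB.h>

    void
    builderExample() {
        // Two lists with 100 and 250 elements each, feature dimension 3.
        fvdb::JaggedTensor a = fvdb::jrand({ 100, 250 }, { 3 });
        fvdb::JaggedTensor b = fvdb::jzeros({ 100, 250 }, { 3 });
        // With dim left unset, jcat concatenates the outermost tensor lists,
        // yielding a JaggedTensor holding four tensors.
        fvdb::JaggedTensor c = fvdb::jcat({ a, b });
    }

Leaving the dim argument of jcat unset follows the declaration above, which concatenates the outermost tensor lists rather than a data dimension.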
/// @param lsizes The lshape of the JaggedTensor (number of elements per tensor) /// @param rsizes The rshape of the JaggedTensor (feature dimension of each tensor) /// @param options The options to use for the created tensor /// @return A JaggedTensor filled with uninitialized values. -JaggedTensor jempty(const std::vector& lsizes, - const std::vector rsizes = {}, - at::TensorOptions options = {}); -JaggedTensor jempty(const std::vector>& lsizes, - const std::vector rsizes = {}, +JaggedTensor jempty(const std::vector &lsizes, const std::vector rsizes = {}, at::TensorOptions options = {}); +JaggedTensor jempty(const std::vector> &lsizes, + const std::vector rsizes = {}, at::TensorOptions options = {}); /// @brief Return a grid batch with voxels which contain a point in an input set of point clouds /// (possibly padding each voxel containing a point) /// @param points A JaggedTensor with shape [B, -1, 3] containing one point set per grid to create -/// @param pad_min A tensor of shape [3,] containing the number of voxels to pad each inserted voxel with to the left/back/bottom -/// @param pad_max A tensor of shape [3,] containing the number of voxels to pad each inserted voxel with to the right/front/top -/// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in the batch or one voxel size for all grids -/// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, 0, 0] voxel +/// @param pad_min A tensor of shape [3,] containing the number of voxels to pad each inserted voxel +/// with to the left/back/bottom +/// @param pad_max A tensor of shape [3,] containing the number of voxels to pad each inserted voxel +/// with to the right/front/top +/// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in +/// the batch or one voxel size for all grids +/// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, +/// 0, 0] voxel /// for each grid in the batch, or one origin for all grids /// @param is_mutable Whether the grid should be mutable or not /// @return A GridBatch containing the created grid batch -GridBatch sparse_grid_from_points(const JaggedTensor& points, - const Vec3i& pad_min = torch::zeros({3}, torch::kInt32), - const Vec3i& pad_max = torch::zeros({3}, torch::kInt32), - const Vec3dBatchOrScalar& voxel_sizes = 1.0, - const Vec3dBatch& origins = torch::zeros({3}), - bool is_mutable = false); - - -/// @brief Return a grid batch with the eight nearest voxels to each point in an input set of point clouds +GridBatch sparse_grid_from_points(const JaggedTensor &points, + const Vec3i &pad_min = torch::zeros({ 3 }, torch::kInt32), + const Vec3i &pad_max = torch::zeros({ 3 }, torch::kInt32), + const Vec3dBatchOrScalar &voxel_sizes = 1.0, + const Vec3dBatch &origins = torch::zeros({ 3 }), + bool is_mutable = false); + +/// @brief Return a grid batch with the eight nearest voxels to each point in an input set of point +/// clouds /// @param points A JaggedTensor with shape [B, -1, 3] containing one point set per grid to create -/// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in the batch or one voxel size for all grids -/// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, 0, 0] voxel +/// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in +/// the batch or one voxel size for all grids +/// @param 
origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, +/// 0, 0] voxel /// for each grid in the batch, or one origin for all grids /// @param is_mutable Whether the grid should be mutable or not /// @return A GridBatch containing the created grid batch -GridBatch sparse_grid_from_nearest_voxels_to_points(const JaggedTensor& points, - const Vec3dBatchOrScalar& voxel_sizes = 1.0, - const Vec3dBatch& origins = torch::zeros({3}), - bool is_mutable = false); - +GridBatch sparse_grid_from_nearest_voxels_to_points(const JaggedTensor &points, + const Vec3dBatchOrScalar &voxel_sizes = 1.0, + const Vec3dBatch &origins = torch::zeros({ 3 }), + bool is_mutable = false); /// @brief REturn a grid batch with the specified voxel coordinates (possibly with padding) -/// @param coords A JaggedTensor of shape [B, -1, 3] specifying the coordinates of each voxel to insert -/// @param pad_min A tensor of shape [3,] containing the number of voxels to pad each inserted voxel with to the left/back/bottom -/// @param pad_max A tensor of shape [3,] containing the number of voxels to pad each inserted voxel with to the right/front/top -/// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in the batch or one voxel size for all grids -/// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, 0, 0] voxel +/// @param coords A JaggedTensor of shape [B, -1, 3] specifying the coordinates of each voxel to +/// insert +/// @param pad_min A tensor of shape [3,] containing the number of voxels to pad each inserted voxel +/// with to the left/back/bottom +/// @param pad_max A tensor of shape [3,] containing the number of voxels to pad each inserted voxel +/// with to the right/front/top +/// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in +/// the batch or one voxel size for all grids +/// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, +/// 0, 0] voxel /// for each grid in the batch, or one origin for all grids /// @return A GridBatch containing the created grid batch -GridBatch sparse_grid_from_ijk(const JaggedTensor& ijk, - const Vec3i& pad_min = torch::zeros({3}, torch::kInt32), - const Vec3i& pad_max = torch::zeros({3}, torch::kInt32), - const Vec3dBatchOrScalar& voxel_sizes = 1.0, - const Vec3dBatch& origins = torch::zeros({3}), - bool is_mutable = false); - +GridBatch sparse_grid_from_ijk(const JaggedTensor &ijk, + const Vec3i &pad_min = torch::zeros({ 3 }, torch::kInt32), + const Vec3i &pad_max = torch::zeros({ 3 }, torch::kInt32), + const Vec3dBatchOrScalar &voxel_sizes = 1.0, + const Vec3dBatch &origins = torch::zeros({ 3 }), + bool is_mutable = false); /// @brief Return a grid batch densely from ijkMin to ijkMin + size /// @param numGrids The number of grids to create in the batch /// @param denseDims The size of each dense grid (shape [3,] = [W, H, D]) /// @param ijkMin The minimum ijk coordinate of each dense grid in the batch (shape [3,]) -/// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in the batch or one voxel size for all grids -/// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, 0, 0] voxel +/// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in +/// the batch or one voxel size for all grids +/// @param origins A tensor of shape [B, 3] or [3,] containing the 
world space coordinate of the [0, +/// 0, 0] voxel /// for each grid in the batch, or one origin for all grids -/// @param mask Optional mask of shape [W, H, D] to specify voxels which are included in the dense grid. +/// @param mask Optional mask of shape [W, H, D] to specify voxels which are included in the dense +/// grid. /// Note that the same mask will be re-used for all the grids in the batch. /// @param device Which device to build the grid batch on /// @param mutable If the returned grid batch should be mutable /// @return A GridBatch containing a batch of dense grids -GridBatch sparse_grid_from_dense(const int64_t numGrids, - const Vec3i& denseDims, - const Vec3i& ijkMin, - const Vec3dBatchOrScalar& voxel_sizes = 1.0, - const Vec3dBatch& origins = torch::zeros({3}), - torch::optional mask = torch::nullopt, - TorchDeviceOrString device = torch::kCPU, - bool is_mutable = false); - - -/// @brief Return a grid batch from a jagged batch of triangle meshes (i.e. each voxel intersects the mesh) -/// @param vertices A JaggedTensor of shape [B, -1, 3] containing the vertices of each mesh in the batch +GridBatch sparse_grid_from_dense(const int64_t numGrids, const Vec3i &denseDims, + const Vec3i &ijkMin, const Vec3dBatchOrScalar &voxel_sizes = 1.0, + const Vec3dBatch &origins = torch::zeros({ 3 }), + torch::optional mask = torch::nullopt, + TorchDeviceOrString device = torch::kCPU, bool is_mutable = false); + +/// @brief Return a grid batch from a jagged batch of triangle meshes (i.e. each voxel intersects +/// the mesh) +/// @param vertices A JaggedTensor of shape [B, -1, 3] containing the vertices of each mesh in the +/// batch /// @param faces A JaggedTensor of shape [B, -1, 3] containing the faces of each mesh in the batch -/// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in the batch or one voxel size for all grids -/// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, 0, 0] voxel +/// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in +/// the batch or one voxel size for all grids +/// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, +/// 0, 0] voxel /// for each grid in the batch, or one origin for all grids /// @param is_mutable Whether the grid should be mutable or not /// @return A GridBatch containing the created grid batch -GridBatch sparse_grid_from_mesh(const JaggedTensor& vertices, - const JaggedTensor& faces, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins, +GridBatch sparse_grid_from_mesh(const JaggedTensor &vertices, const JaggedTensor &faces, + const Vec3dBatchOrScalar &voxel_sizes, const Vec3dBatch &origins, bool is_mutable); /// @brief Return a grid batch, tensors of data, and names from a nanovdb grid handle /// @param handle nanovdb grid handle -/// @return A triple (gridbatch, data, names) where gridbatch is a GridBatch containing the converted grids, -/// data is a JaggedTensor containing the data of the grids, and names is a list of strings containing -/// the name of each grid +/// @return A triple (gridbatch, data, names) where gridbatch is a GridBatch containing the +/// converted grids, +/// data is a JaggedTensor containing the data of the grids, and names is a list of strings +/// containing the name of each grid std::tuple> -from_nanovdb(nanovdb::GridHandle& handle); - +from_nanovdb(nanovdb::GridHandle &handle); -/// @brief Return a nanovdb grid handle 
created from a grid batch, optional jagged tensor of data, and optional +/// @brief Return a nanovdb grid handle created from a grid batch, optional jagged tensor of data, +/// and optional /// list of names /// @param gridBatch The gridbatch to convert -/// @param maybeData Optional JaggedTensor of data to save with the grid batch (one element per voxel) -/// @param maybeNames Optional list of names for each grid in the batch (or a single name to use for every grid) -/// @return A nanovdb grid handle, whose type is inferred from the data, containing the converted grids +/// @param maybeData Optional JaggedTensor of data to save with the grid batch (one element per +/// voxel) +/// @param maybeNames Optional list of names for each grid in the batch (or a single name to use +/// for every grid) +/// @return A nanovdb grid handle, whose type is inferred from the data, containing the converted +/// grids nanovdb::GridHandle -to_nanovdb(const GridBatch& gridBatch, - const torch::optional maybeData = torch::optional(), - const torch::optional maybeNames = torch::optional()); - +to_nanovdb(const GridBatch &gridBatch, + const torch::optional maybeData = torch::optional(), + const torch::optional maybeNames = + torch::optional()); -/// @brief Save a grid batch and optional jagged tensor to a .nvdb file. Will overwrite existing files. +/// @brief Save a grid batch and optional jagged tensor to a .nvdb file. Will overwrite existing +/// files. /// @param path The path to save the file to. /// @param gridBatch The gridbatch to save -/// @param maybeData Optional JaggedTensor of data to save with the grid batch (one element per voxel) -/// @param maybeNames Optional list of names for each grid in the batch (or a single name to use for every grid) +/// @param maybeData Optional JaggedTensor of data to save with the grid batch (one element per +/// voxel) +/// @param maybeNames Optional list of names for each grid in the batch (or a single name to use for +/// every grid) /// @param compressed Whether to compress the stored grid using Blosc (https://www.blosc.org/) /// @param verbose Whether to print information about the saved grids -void save(const std::string& path, - const GridBatch& gridBatch, - const torch::optional maybeData = torch::optional(), - const torch::optional maybeNames = torch::optional(), - bool compressed = false, - bool verbose = false); - - -/// @brief Load a grid batch from a .nvdb file. This function loads each nanovdb grid into the batch as well +void save(const std::string &path, const GridBatch &gridBatch, + const torch::optional maybeData = torch::optional(), + const torch::optional maybeNames = + torch::optional(), + bool compressed = false, bool verbose = false); + +/// @brief Load a grid batch from a .nvdb file. This function loads each nanovdb grid into the batch +/// as well /// as a list of tensors containing the data at each grid in the batch /// (e.g. 
a Vec3d grid will load a [num_voxels, 3] float64 tensor) /// @param path The path to the .nvdb file to load -/// @param gridIdentifier The identifier (index, list of indices, name, list of names) to load from the file +/// @param gridIdentifier The identifier (index, list of indices, name, list of names) to load from +/// the file /// @param device Which device to load the grid batch on /// @param verbose If set to true, print information about the loaded grids -/// @return A triple (gridbatch, data, names) where gridbatch is a GridBatch containing the loaded grids, -/// data is a JaggedTensor containing the data of the grids, and names is a list of strings containing -/// the name of each grid +/// @return A triple (gridbatch, data, names) where gridbatch is a GridBatch containing the loaded +/// grids, +/// data is a JaggedTensor containing the data of the grids, and names is a list of strings +/// containing the name of each grid std::tuple> -load(const std::string& path, - NanoVDBFileGridIdentifier gridIdentifier, - TorchDeviceOrString device, +load(const std::string &path, NanoVDBFileGridIdentifier gridIdentifier, TorchDeviceOrString device, bool verbose = false); +} // namespace fvdb -} // namespace fvdb \ No newline at end of file +#endif // FVDB_FVDB_H \ No newline at end of file diff --git a/fvdb/src/GridBatch.cpp b/fvdb/src/GridBatch.cpp index f40edb19ce..2a2e0a90fd 100644 --- a/fvdb/src/GridBatch.cpp +++ b/fvdb/src/GridBatch.cpp @@ -5,12 +5,10 @@ #include "FVDB.h" #include "detail/GridBatchImpl.h" -#include "detail/build/Build.h" -#include "detail/ops/Ops.h" #include "detail/autograd/Autograd.h" +#include "detail/build/Build.h" #include "detail/io/IO.h" - - +#include "detail/ops/Ops.h" namespace fvdb { @@ -18,22 +16,22 @@ GridBatch::GridBatch(TorchDeviceOrString device, bool isMutable) { mImpl = c10::make_intrusive(device.value(), isMutable); } - GridBatch::GridBatch() { - mImpl = c10::make_intrusive(detail::build::buildEmptyGrid(torch::kCPU, false), nanovdb::Vec3d(1.0), nanovdb::Vec3d(0.0)); + mImpl = c10::make_intrusive( + detail::build::buildEmptyGrid(torch::kCPU, false), nanovdb::Vec3d(1.0), + nanovdb::Vec3d(0.0)); } - -std::pair GridBatch::max_pool(Vec3iOrScalar pool_factor, - const JaggedTensor& data, - Vec3iOrScalar stride, - torch::optional coarse_grid) const { - TORCH_CHECK_VALUE(data.ldim() == 1, - "Expected data to have 1 list dimension, i.e. be a single list of coordinate values, but got", data.ldim(), "list dimensions" - ); +std::pair +GridBatch::max_pool(Vec3iOrScalar pool_factor, const JaggedTensor &data, Vec3iOrScalar stride, + torch::optional coarse_grid) const { + TORCH_CHECK_VALUE( + data.ldim() == 1, + "Expected data to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", + data.ldim(), "list dimensions"); nanovdb::Coord pool_factor_coord = pool_factor.value(); - nanovdb::Coord stride_coord = stride.value(); + nanovdb::Coord stride_coord = stride.value(); for (int i = 0; i < 3; i += 1) { if (stride_coord[i] == 0) { @@ -51,23 +49,20 @@ std::pair GridBatch::max_pool(Vec3iOrScalar pool_factor torch::Tensor pool_data = detail::autograd::MaxPoolGrid::apply( impl(), coarse_grid_impl, pool_factor_coord, stride_coord, data.jdata())[0]; - return std::make_pair( - coarse_grid_impl->jaggedTensor(pool_data, false), - GridBatch(coarse_grid_impl) - ); + return std::make_pair(coarse_grid_impl->jaggedTensor(pool_data, false), + GridBatch(coarse_grid_impl)); } - -std::pair GridBatch::avg_pool(Vec3iOrScalar pool_factor, - const JaggedTensor& data, - Vec3iOrScalar stride, - torch::optional coarse_grid) const { - TORCH_CHECK_VALUE(data.ldim() == 1, - "Expected data to have 1 list dimension, i.e. be a single list of coordinate values, but got", data.ldim(), "list dimensions" - ); +std::pair +GridBatch::avg_pool(Vec3iOrScalar pool_factor, const JaggedTensor &data, Vec3iOrScalar stride, + torch::optional coarse_grid) const { + TORCH_CHECK_VALUE( + data.ldim() == 1, + "Expected data to have 1 list dimension, i.e. be a single list of coordinate values, but got", + data.ldim(), "list dimensions"); nanovdb::Coord pool_factor_coord = pool_factor.value(); - nanovdb::Coord stride_coord = stride.value(); + nanovdb::Coord stride_coord = stride.value(); for (int i = 0; i < 3; i += 1) { if (stride_coord[i] == 0) { @@ -85,25 +80,23 @@ std::pair GridBatch::avg_pool(Vec3iOrScalar pool_factor torch::Tensor pool_data = detail::autograd::AvgPoolGrid::apply( impl(), coarse_grid_impl, pool_factor_coord, stride_coord, data.jdata())[0]; - return std::make_pair( - coarse_grid_impl->jaggedTensor(pool_data, false), - GridBatch(coarse_grid_impl) - ); + return std::make_pair(coarse_grid_impl->jaggedTensor(pool_data, false), + GridBatch(coarse_grid_impl)); } - -std::pair GridBatch::subdivide(Vec3iOrScalar subdiv_factor, - const JaggedTensor& data, - const torch::optional mask, - torch::optional fine_grid) const { - - TORCH_CHECK_VALUE(data.ldim() == 1, - "Expected data to have 1 list dimension, i.e. be a single list of coordinate values, but got", data.ldim(), "list dimensions" - ); +std::pair +GridBatch::subdivide(Vec3iOrScalar subdiv_factor, const JaggedTensor &data, + const torch::optional mask, + torch::optional fine_grid) const { + TORCH_CHECK_VALUE( + data.ldim() == 1, + "Expected data to have 1 list dimension, i.e. be a single list of coordinate values, but got", + data.ldim(), "list dimensions"); if (mask.has_value()) { - TORCH_CHECK_VALUE(mask.value().ldim() == 1, - "Expected mask to have 1 list dimension, i.e. be a single list of coordinate values, but got", mask.value().ldim(), "list dimensions" - ); + TORCH_CHECK_VALUE( + mask.value().ldim() == 1, + "Expected mask to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", + mask.value().ldim(), "list dimensions"); } const nanovdb::Coord upsampleFactorCoord = subdiv_factor.value(); @@ -114,66 +107,70 @@ std::pair GridBatch::subdivide(Vec3iOrScalar subdiv_fac fineGrid = subdivided_grid(subdiv_factor, mask).impl(); } - torch::Tensor subdivData = detail::autograd::UpsampleGrid::apply(impl(), fineGrid, upsampleFactorCoord, data.jdata())[0]; + torch::Tensor subdivData = detail::autograd::UpsampleGrid::apply( + impl(), fineGrid, upsampleFactorCoord, data.jdata())[0]; - return std::make_pair( - fineGrid->jaggedTensor(subdivData, false), - GridBatch(fineGrid) - ); + return std::make_pair(fineGrid->jaggedTensor(subdivData, false), GridBatch(fineGrid)); } - -JaggedTensor GridBatch::read_from_dense(const torch::Tensor& dense_data, - const Vec3iBatch& dense_origins) const { - torch::Tensor retData = detail::autograd::ReadFromDense::apply(impl(), dense_data, dense_origins)[0]; +JaggedTensor +GridBatch::read_from_dense(const torch::Tensor &dense_data, const Vec3iBatch &dense_origins) const { + torch::Tensor retData = + detail::autograd::ReadFromDense::apply(impl(), dense_data, dense_origins)[0]; return impl()->jaggedTensor(retData, false); } - -torch::Tensor GridBatch::read_into_dense(const JaggedTensor& sparse_data, - const torch::optional& min_coord, - const torch::optional& grid_size) const { - TORCH_CHECK_VALUE(sparse_data.ldim() == 1, - "Expected sparse_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", sparse_data.ldim(), "list dimensions" - ); - return detail::autograd::ReadIntoDense::apply(impl(), sparse_data.jdata(), min_coord, grid_size)[0]; -} - -JaggedTensor GridBatch::fill_to_grid(const JaggedTensor& features, - const GridBatch& other_grid, - float default_value) const { - TORCH_CHECK_VALUE(features.ldim() == 1, - "Expected features to have 1 list dimension, i.e. be a single list of coordinate values, but got", features.ldim(), "list dimensions" - ); +torch::Tensor +GridBatch::read_into_dense(const JaggedTensor &sparse_data, + const torch::optional &min_coord, + const torch::optional &grid_size) const { + TORCH_CHECK_VALUE( + sparse_data.ldim() == 1, + "Expected sparse_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", + sparse_data.ldim(), "list dimensions"); + return detail::autograd::ReadIntoDense::apply(impl(), sparse_data.jdata(), min_coord, + grid_size)[0]; +} + +JaggedTensor +GridBatch::fill_to_grid(const JaggedTensor &features, const GridBatch &other_grid, + float default_value) const { + TORCH_CHECK_VALUE( + features.ldim() == 1, + "Expected features to have 1 list dimension, i.e. be a single list of coordinate values, but got", + features.ldim(), "list dimensions"); torch::Tensor retData = detail::autograd::FillToGrid::apply(other_grid.impl(), impl(), features.jdata(), default_value)[0]; return impl()->jaggedTensor(retData, false); } - -JaggedTensor GridBatch::grid_to_world(const JaggedTensor& ijk) const { - TORCH_CHECK_VALUE(ijk.ldim() == 1, - "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", ijk.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::grid_to_world(const JaggedTensor &ijk) const { + TORCH_CHECK_VALUE( + ijk.ldim() == 1, + "Expected ijk to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", + ijk.ldim(), "list dimensions"); torch::Tensor ret = detail::autograd::TransformPoints::apply( impl(), ijk, ijk.jdata(), true /*isInverse*/, false /*isDual*/)[0]; return ijk.jagged_like(ret); } - -JaggedTensor GridBatch::world_to_grid(const JaggedTensor& xyz) const { - TORCH_CHECK_VALUE(xyz.ldim() == 1, - "Expected xyz to have 1 list dimension, i.e. be a single list of coordinate values, but got", xyz.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::world_to_grid(const JaggedTensor &xyz) const { + TORCH_CHECK_VALUE( + xyz.ldim() == 1, + "Expected xyz to have 1 list dimension, i.e. be a single list of coordinate values, but got", + xyz.ldim(), "list dimensions"); torch::Tensor ret = detail::autograd::TransformPoints::apply( impl(), xyz, xyz.jdata(), false /* isInverse*/, false /*isDual*/)[0]; return xyz.jagged_like(ret); } -torch::Tensor GridBatch::grid_to_world_matrices(const torch::Dtype& dtype) const { +torch::Tensor +GridBatch::grid_to_world_matrices(const torch::Dtype &dtype) const { std::vector retTorch; for (int64_t bi = 0; bi < grid_count(); ++bi) { retTorch.emplace_back(impl()->gridToWorldMatrix(bi)); @@ -182,7 +179,8 @@ torch::Tensor GridBatch::grid_to_world_matrices(const torch::Dtype& dtype) const return torch::stack(retTorch, 0).toType(dtype); } -torch::Tensor GridBatch::world_to_grid_matrices(const torch::Dtype& dtype) const { +torch::Tensor +GridBatch::world_to_grid_matrices(const torch::Dtype &dtype) const { std::vector retTorch; for (int64_t bi = 0; bi < grid_count(); ++bi) { retTorch.emplace_back(impl()->worldToGridMatrix(bi)); @@ -191,68 +189,81 @@ torch::Tensor GridBatch::world_to_grid_matrices(const torch::Dtype& dtype) const return torch::stack(retTorch, 0).toType(dtype); } -JaggedTensor GridBatch::sample_trilinear(const JaggedTensor& points, - const JaggedTensor& voxel_data) const { - TORCH_CHECK_VALUE(points.ldim() == 1, - "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", points.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(voxel_data.ldim() == 1, - "Expected voxel_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", voxel_data.ldim(), "list dimensions" - ); - torch::Tensor ret = detail::autograd::SampleGridTrilinear::apply(impl(), points, voxel_data.jdata(), false /*returnGrad*/)[0]; +JaggedTensor +GridBatch::sample_trilinear(const JaggedTensor &points, const JaggedTensor &voxel_data) const { + TORCH_CHECK_VALUE( + points.ldim() == 1, + "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", + points.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + voxel_data.ldim() == 1, + "Expected voxel_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", + voxel_data.ldim(), "list dimensions"); + torch::Tensor ret = detail::autograd::SampleGridTrilinear::apply( + impl(), points, voxel_data.jdata(), false /*returnGrad*/)[0]; return points.jagged_like(ret); } - -std::vector GridBatch::sample_trilinear_with_grad(const JaggedTensor& points, - const JaggedTensor& voxel_data) const { - TORCH_CHECK_VALUE(points.ldim() == 1, - "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", points.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(voxel_data.ldim() == 1, - "Expected voxel_data to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", voxel_data.ldim(), "list dimensions" - ); - std::vector ret = detail::autograd::SampleGridTrilinear::apply(impl(), points, voxel_data.jdata(), true /*returnGrad*/); - - return {points.jagged_like(ret[0]), points.jagged_like(ret[1])}; -} - - -JaggedTensor GridBatch::sample_bezier(const JaggedTensor& points, - const JaggedTensor& voxel_data) const { - TORCH_CHECK_VALUE(points.ldim() == 1, - "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", points.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(voxel_data.ldim() == 1, - "Expected voxel_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", voxel_data.ldim(), "list dimensions" - ); - torch::Tensor ret = detail::autograd::SampleGridBezier::apply(impl(), points, voxel_data.jdata(), false /*returnGrad*/)[0]; +std::vector +GridBatch::sample_trilinear_with_grad(const JaggedTensor &points, + const JaggedTensor &voxel_data) const { + TORCH_CHECK_VALUE( + points.ldim() == 1, + "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", + points.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + voxel_data.ldim() == 1, + "Expected voxel_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", + voxel_data.ldim(), "list dimensions"); + std::vector ret = detail::autograd::SampleGridTrilinear::apply( + impl(), points, voxel_data.jdata(), true /*returnGrad*/); + + return { points.jagged_like(ret[0]), points.jagged_like(ret[1]) }; +} + +JaggedTensor +GridBatch::sample_bezier(const JaggedTensor &points, const JaggedTensor &voxel_data) const { + TORCH_CHECK_VALUE( + points.ldim() == 1, + "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", + points.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + voxel_data.ldim() == 1, + "Expected voxel_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", + voxel_data.ldim(), "list dimensions"); + torch::Tensor ret = detail::autograd::SampleGridBezier::apply( + impl(), points, voxel_data.jdata(), false /*returnGrad*/)[0]; return points.jagged_like(ret); } - -std::vector GridBatch::sample_bezier_with_grad(const JaggedTensor& points, - const JaggedTensor& voxel_data) const { - TORCH_CHECK_VALUE(points.ldim() == 1, - "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", points.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(voxel_data.ldim() == 1, - "Expected voxel_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", voxel_data.ldim(), "list dimensions" - ); - auto ret = detail::autograd::SampleGridBezier::apply(impl(), points, voxel_data.jdata(), true /*returnGrad*/); - return {points.jagged_like(ret[0]), points.jagged_like(ret[1])}; -} - - -JaggedTensor GridBatch::splat_trilinear(const JaggedTensor& points, - const JaggedTensor& points_data) const { - TORCH_CHECK_VALUE(points.ldim() == 1, - "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", points.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(points_data.ldim() == 1, - "Expected points_data to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", points_data.ldim(), "list dimensions" - ); - torch::Tensor ret = detail::autograd::SplatIntoGridTrilinear::apply(impl(), points, points_data.jdata())[0]; +std::vector +GridBatch::sample_bezier_with_grad(const JaggedTensor &points, + const JaggedTensor &voxel_data) const { + TORCH_CHECK_VALUE( + points.ldim() == 1, + "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", + points.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + voxel_data.ldim() == 1, + "Expected voxel_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", + voxel_data.ldim(), "list dimensions"); + auto ret = detail::autograd::SampleGridBezier::apply(impl(), points, voxel_data.jdata(), + true /*returnGrad*/); + return { points.jagged_like(ret[0]), points.jagged_like(ret[1]) }; +} + +JaggedTensor +GridBatch::splat_trilinear(const JaggedTensor &points, const JaggedTensor &points_data) const { + TORCH_CHECK_VALUE( + points.ldim() == 1, + "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", + points.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + points_data.ldim() == 1, + "Expected points_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", + points_data.ldim(), "list dimensions"); + torch::Tensor ret = + detail::autograd::SplatIntoGridTrilinear::apply(impl(), points, points_data.jdata())[0]; if (grid_count() == 1) { return JaggedTensor(ret); } else { @@ -260,16 +271,18 @@ JaggedTensor GridBatch::splat_trilinear(const JaggedTensor& points, } } - -JaggedTensor GridBatch::splat_bezier(const JaggedTensor& points, - const JaggedTensor& points_data) const { - TORCH_CHECK_VALUE(points.ldim() == 1, - "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", points.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(points_data.ldim() == 1, - "Expected points_data to have 1 list dimension, i.e. be a single list of coordinate values, but got", points_data.ldim(), "list dimensions" - ); - torch::Tensor ret = detail::autograd::SplatIntoGridBezier::apply(impl(), points, points_data.jdata())[0]; +JaggedTensor +GridBatch::splat_bezier(const JaggedTensor &points, const JaggedTensor &points_data) const { + TORCH_CHECK_VALUE( + points.ldim() == 1, + "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", + points.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + points_data.ldim() == 1, + "Expected points_data to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", + points_data.ldim(), "list dimensions"); + torch::Tensor ret = + detail::autograd::SplatIntoGridBezier::apply(impl(), points, points_data.jdata())[0]; if (grid_count() == 1) { return JaggedTensor(ret); } else { @@ -277,51 +290,58 @@ JaggedTensor GridBatch::splat_bezier(const JaggedTensor& points, } } - -torch::Tensor GridBatch::voxel_size_at(int64_t bi, const torch::Dtype& dtype) const { - torch::Tensor retTorch = torch::empty({3}, torch::TensorOptions().device(this->device()).dtype(dtype)); - const nanovdb::Vec3d& voxSize = impl()->voxelSize(bi); - retTorch[0] = voxSize[0]; - retTorch[1] = voxSize[1]; - retTorch[2] = voxSize[2]; +torch::Tensor +GridBatch::voxel_size_at(int64_t bi, const torch::Dtype &dtype) const { + torch::Tensor retTorch = + torch::empty({ 3 }, torch::TensorOptions().device(this->device()).dtype(dtype)); + const nanovdb::Vec3d &voxSize = impl()->voxelSize(bi); + retTorch[0] = voxSize[0]; + retTorch[1] = voxSize[1]; + retTorch[2] = voxSize[2]; return retTorch; } -torch::Tensor GridBatch::voxel_sizes(const torch::Dtype& dtype) const { - torch::Tensor retTorch = torch::empty({grid_count(), 3}, torch::TensorOptions().device(this->device()).dtype(dtype)); +torch::Tensor +GridBatch::voxel_sizes(const torch::Dtype &dtype) const { + torch::Tensor retTorch = torch::empty( + { grid_count(), 3 }, torch::TensorOptions().device(this->device()).dtype(dtype)); for (int64_t bi = 0; bi < grid_count(); bi += 1) { const nanovdb::Vec3d voxSize = impl()->voxelSize(bi); - retTorch[bi][0] = voxSize[0]; - retTorch[bi][1] = voxSize[1]; - retTorch[bi][2] = voxSize[2]; + retTorch[bi][0] = voxSize[0]; + retTorch[bi][1] = voxSize[1]; + retTorch[bi][2] = voxSize[2]; } return retTorch; } -torch::Tensor GridBatch::origin_at(int64_t bi, const torch::Dtype& dtype) const { - const nanovdb::Vec3d& voxelOrigin = impl()->voxelOrigin(bi); - torch::Tensor retTorch = torch::empty({3}, torch::TensorOptions().device(this->device()).dtype(dtype)); +torch::Tensor +GridBatch::origin_at(int64_t bi, const torch::Dtype &dtype) const { + const nanovdb::Vec3d &voxelOrigin = impl()->voxelOrigin(bi); + torch::Tensor retTorch = + torch::empty({ 3 }, torch::TensorOptions().device(this->device()).dtype(dtype)); retTorch[0] = voxelOrigin[0]; retTorch[1] = voxelOrigin[1]; retTorch[2] = voxelOrigin[2]; return retTorch; } - -torch::Tensor GridBatch::origins(const torch::Dtype& dtype) const { - torch::Tensor retTorch = torch::empty({grid_count(), 3}, torch::TensorOptions().device(this->device()).dtype(dtype)); +torch::Tensor +GridBatch::origins(const torch::Dtype &dtype) const { + torch::Tensor retTorch = torch::empty( + { grid_count(), 3 }, torch::TensorOptions().device(this->device()).dtype(dtype)); for (int64_t bi = 0; bi < grid_count(); bi += 1) { - const nanovdb::Vec3d& voxOrigin = impl()->voxelOrigin(bi); - retTorch[bi][0] = voxOrigin[0]; - retTorch[bi][1] = voxOrigin[1]; - retTorch[bi][2] = voxOrigin[2]; + const nanovdb::Vec3d &voxOrigin = impl()->voxelOrigin(bi); + retTorch[bi][0] = voxOrigin[0]; + retTorch[bi][1] = voxOrigin[1]; + retTorch[bi][2] = voxOrigin[2]; } return retTorch; } - -torch::Tensor GridBatch::num_voxels() const { - torch::Tensor retTorch = torch::empty({grid_count()}, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); +torch::Tensor +GridBatch::num_voxels() const { + torch::Tensor retTorch = torch::empty( + { grid_count() }, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); auto acc = retTorch.accessor(); for (int64_t bi = 
0; bi < grid_count(); bi += 1) { @@ -330,11 +350,13 @@ torch::Tensor GridBatch::num_voxels() const { return retTorch.to(device()); } -torch::Tensor GridBatch::num_enabled_voxels() const { +torch::Tensor +GridBatch::num_enabled_voxels() const { if (!is_mutable()) { return num_voxels(); } - torch::Tensor retTorch = torch::empty({grid_count()}, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); + torch::Tensor retTorch = torch::empty( + { grid_count() }, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); auto acc = retTorch.accessor(); for (int64_t bi = 0; bi < grid_count(); bi += 1) { @@ -343,7 +365,8 @@ torch::Tensor GridBatch::num_enabled_voxels() const { return retTorch.to(device()); } -int64_t GridBatch::num_enabled_voxels_at(int64_t bi) const { +int64_t +GridBatch::num_enabled_voxels_at(int64_t bi) const { if (!is_mutable()) { return num_voxels_at(bi); } @@ -352,8 +375,10 @@ int64_t GridBatch::num_enabled_voxels_at(int64_t bi) const { }); } -torch::Tensor GridBatch::cum_voxels() const { - torch::Tensor retTorch = torch::empty({grid_count()}, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); +torch::Tensor +GridBatch::cum_voxels() const { + torch::Tensor retTorch = torch::empty( + { grid_count() }, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); auto acc = retTorch.accessor(); for (int64_t bi = 0; bi < grid_count(); bi += 1) { @@ -362,11 +387,13 @@ torch::Tensor GridBatch::cum_voxels() const { return retTorch.to(device()); } -torch::Tensor GridBatch::cum_enabled_voxels() const { +torch::Tensor +GridBatch::cum_enabled_voxels() const { if (!is_mutable()) { return cum_voxels(); } - torch::Tensor retTorch = torch::empty({grid_count()}, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); + torch::Tensor retTorch = torch::empty( + { grid_count() }, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); auto acc = retTorch.accessor(); for (int64_t bi = 0; bi < grid_count(); bi += 1) { @@ -375,7 +402,8 @@ torch::Tensor GridBatch::cum_enabled_voxels() const { return retTorch.to(device()); } -int64_t GridBatch::cum_enabled_voxels_at(int64_t bi) const { +int64_t +GridBatch::cum_enabled_voxels_at(int64_t bi) const { int64_t nCum = 0; for (int64_t b = 0; b < bi; ++b) { nCum += num_enabled_voxels_at(b); @@ -383,8 +411,10 @@ int64_t GridBatch::cum_enabled_voxels_at(int64_t bi) const { return nCum; } -torch::Tensor GridBatch::num_bytes() const { - torch::Tensor retTorch = torch::empty({grid_count()}, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); +torch::Tensor +GridBatch::num_bytes() const { + torch::Tensor retTorch = torch::empty( + { grid_count() }, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); auto acc = retTorch.accessor(); for (int64_t bi = 0; bi < grid_count(); bi += 1) { @@ -393,9 +423,10 @@ torch::Tensor GridBatch::num_bytes() const { return retTorch.to(device()); } - -torch::Tensor GridBatch::num_leaf_nodes() const { - torch::Tensor retTorch = torch::empty({grid_count()}, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); +torch::Tensor +GridBatch::num_leaf_nodes() const { + torch::Tensor retTorch = torch::empty( + { grid_count() }, torch::TensorOptions().device(torch::kCPU).dtype(torch::kInt64)); auto acc = retTorch.accessor(); for (int64_t bi = 0; bi < grid_count(); bi += 1) { @@ -404,202 +435,232 @@ torch::Tensor GridBatch::num_leaf_nodes() const { return retTorch.to(device()); } - -void GridBatch::disable_ijk(const JaggedTensor& ijk) { - 
TORCH_CHECK_VALUE(ijk.ldim() == 1, - "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", ijk.ldim(), "list dimensions" - ); +void +GridBatch::disable_ijk(const JaggedTensor &ijk) { + TORCH_CHECK_VALUE( + ijk.ldim() == 1, + "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ijk.ldim(), "list dimensions"); FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { fvdb::detail::ops::dispatchSetMaskedIjk(*impl(), ijk, false); }); } - -void GridBatch::enable_ijk(const JaggedTensor& ijk) { - TORCH_CHECK_VALUE(ijk.ldim() == 1, - "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", ijk.ldim(), "list dimensions" - ); +void +GridBatch::enable_ijk(const JaggedTensor &ijk) { + TORCH_CHECK_VALUE( + ijk.ldim() == 1, + "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ijk.ldim(), "list dimensions"); FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { fvdb::detail::ops::dispatchSetMaskedIjk(*impl(), ijk, true); }); } -void GridBatch::set_from_mesh(const JaggedTensor& mesh_vertices, - const JaggedTensor& mesh_faces, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins) { - TORCH_CHECK_VALUE(mesh_vertices.ldim() == 1, - "Expected mesh_vertices to have 1 list dimension, i.e. be a single list of coordinate values, but got", mesh_vertices.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(mesh_faces.ldim() == 1, - "Expected mesh_faces to have 1 list dimension, i.e. be a single list of coordinate values, but got", mesh_faces.ldim(), "list dimensions" - ); - TORCH_CHECK_TYPE(mesh_vertices.is_floating_point(), "mesh_vertices must have a floating point type"); - TORCH_CHECK_VALUE(mesh_vertices.rdim() == 2, std::string("Expected mesh_vertices to have 2 dimensions (shape (n, 3)) but got ") + - std::to_string(mesh_vertices.rdim()) + " dimensions"); +void +GridBatch::set_from_mesh(const JaggedTensor &mesh_vertices, const JaggedTensor &mesh_faces, + const Vec3dBatchOrScalar &voxel_sizes, const Vec3dBatch &origins) { + TORCH_CHECK_VALUE( + mesh_vertices.ldim() == 1, + "Expected mesh_vertices to have 1 list dimension, i.e. be a single list of coordinate values, but got", + mesh_vertices.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + mesh_faces.ldim() == 1, + "Expected mesh_faces to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", + mesh_faces.ldim(), "list dimensions"); + TORCH_CHECK_TYPE(mesh_vertices.is_floating_point(), + "mesh_vertices must have a floating point type"); + TORCH_CHECK_VALUE( + mesh_vertices.rdim() == 2, + std::string("Expected mesh_vertices to have 2 dimensions (shape (n, 3)) but got ") + + std::to_string(mesh_vertices.rdim()) + " dimensions"); TORCH_CHECK_VALUE(mesh_vertices.rsize(1) == 3, "Expected 3 dimensional mesh_vertices but got mesh_vertices.rshape[1] = " + - std::to_string(mesh_vertices.rsize(1))); + std::to_string(mesh_vertices.rsize(1))); TORCH_CHECK_TYPE(!mesh_faces.is_floating_point(), "mesh_faces must have an integer type"); - TORCH_CHECK_VALUE(mesh_faces.rdim() == 2, std::string("Expected mesh_faces to have 2 dimensions (shape (n, 3)) but got ") + - std::to_string(mesh_faces.rdim()) + " dimensions"); + TORCH_CHECK_VALUE( + mesh_faces.rdim() == 2, + std::string("Expected mesh_faces to have 2 dimensions (shape (n, 3)) but got ") + + std::to_string(mesh_faces.rdim()) + " dimensions"); TORCH_CHECK_VALUE(mesh_faces.rsize(1) == 3, "Expected 3 dimensional mesh_faces but got mesh_faces.rshape[1] = " + - std::to_string(mesh_faces.rsize(1))); + std::to_string(mesh_faces.rsize(1))); TORCH_CHECK_VALUE(mesh_vertices.num_outer_lists() == mesh_faces.num_outer_lists(), "Expected same number of vertex and face sets got len(mesh_vertices) = ", - mesh_vertices.num_outer_lists(), " and len(mesh_faces) = ", mesh_faces.num_outer_lists()); + mesh_vertices.num_outer_lists(), + " and len(mesh_faces) = ", mesh_faces.num_outer_lists()); const int64_t numGrids = mesh_vertices.joffsets().size(0) - 1; - TORCH_CHECK(numGrids == mesh_vertices.num_outer_lists(), "If this happens, Francis' paranoia was justified. File a bug"); - TORCH_CHECK_VALUE(numGrids <= MAX_GRIDS_PER_BATCH, - "Cannot create a grid with more than ", MAX_GRIDS_PER_BATCH, " grids in a batch. ", - "You passed in ", numGrids, " mesh sets."); + TORCH_CHECK(numGrids == mesh_vertices.num_outer_lists(), + "If this happens, Francis' paranoia was justified. File a bug"); + TORCH_CHECK_VALUE(numGrids <= MAX_GRIDS_PER_BATCH, "Cannot create a grid with more than ", + MAX_GRIDS_PER_BATCH, " grids in a batch. ", "You passed in ", numGrids, + " mesh sets."); - const std::vector voxSizesVec = voxel_sizes.value(numGrids, true /* onlyPositive */, "voxel_sizes"); - const std::vector voxOriginsVec = origins.value(numGrids, false /* onlyPositive */, "voxel_origins"); + const std::vector voxSizesVec = + voxel_sizes.value(numGrids, true /* onlyPositive */, "voxel_sizes"); + const std::vector voxOriginsVec = + origins.value(numGrids, false /* onlyPositive */, "voxel_origins"); std::vector transforms; transforms.reserve(numGrids); for (int64_t i = 0; i < numGrids; i += 1) { - transforms.push_back(detail::primalVoxelTransformForSizeAndOrigin(voxSizesVec[i], voxOriginsVec[i])); + transforms.push_back( + detail::primalVoxelTransformForSizeAndOrigin(voxSizesVec[i], voxOriginsVec[i])); } mImpl = c10::make_intrusive( - detail::build::buildGridFromMesh(is_mutable(), mesh_vertices, mesh_faces, transforms), - voxSizesVec, voxOriginsVec); + detail::build::buildGridFromMesh(is_mutable(), mesh_vertices, mesh_faces, transforms), + voxSizesVec, voxOriginsVec); } - -void GridBatch::set_from_points(const JaggedTensor& points, - const Vec3i& pad_min, - const Vec3i& pad_max, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins) { - TORCH_CHECK_VALUE(points.ldim() == 1, - "Expected points to have 1 list dimension, i.e. 
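
set_from_mesh, reformatted above, voxelizes one or more triangle meshes into the batch. Below is a sketch of a single-mesh call. It assumes, as the checks above imply, that JaggedTensor can wrap a single [V, 3] float tensor and a single [F, 3] integer tensor, and that the Vec3dBatchOrScalar / Vec3dBatch arguments accept a plain double broadcast to every grid; if Types.h requires explicit constructors, substitute them. The helper name and the 5 cm voxel size are illustrative only.

#include <torch/torch.h>
#include "GridBatch.h"
#include "JaggedTensor.h"

// Sketch: voxelize one mesh with a 0.05 voxel size and the origin at (0, 0, 0).
fvdb::GridBatch gridFromMesh(const torch::Tensor &verts,   // float, [V, 3]
                             const torch::Tensor &faces) { // int,   [F, 3]
    fvdb::GridBatch grid(verts.device(), /*isMutable=*/false);
    grid.set_from_mesh(fvdb::JaggedTensor(verts), fvdb::JaggedTensor(faces),
                       /*voxel_sizes=*/0.05, /*origins=*/0.0);
    return grid;
}
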
be a single list of coordinate values, but got", points.ldim(), "list dimensions" - ); +void +GridBatch::set_from_points(const JaggedTensor &points, const Vec3i &pad_min, const Vec3i &pad_max, + const Vec3dBatchOrScalar &voxel_sizes, const Vec3dBatch &origins) { + TORCH_CHECK_VALUE( + points.ldim() == 1, + "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", + points.ldim(), "list dimensions"); TORCH_CHECK_TYPE(points.is_floating_point(), "points must have a floating point type"); - TORCH_CHECK_VALUE(points.rdim() == 2, std::string("Expected points to have 2 dimensions (shape (n, 3)) but got ") + - std::to_string(points.rdim()) + " dimensions"); + TORCH_CHECK_VALUE(points.rdim() == 2, + std::string("Expected points to have 2 dimensions (shape (n, 3)) but got ") + + std::to_string(points.rdim()) + " dimensions"); TORCH_CHECK_VALUE(points.rsize(1) == 3, "Expected 3 dimensional points but got points.rshape[1] = " + - std::to_string(points.rsize(1))); + std::to_string(points.rsize(1))); impl()->checkDevice(points); - TORCH_CHECK(points.num_tensors() == points.num_outer_lists(), "If this happens, Francis' paranoia about tensors and points was justified. File a bug"); + TORCH_CHECK( + points.num_tensors() == points.num_outer_lists(), + "If this happens, Francis' paranoia about tensors and points was justified. File a bug"); TORCH_CHECK_VALUE(points.num_outer_lists() <= MAX_GRIDS_PER_BATCH, - "Cannot create a grid with more than ", MAX_GRIDS_PER_BATCH, " grids in a batch. ", - "You passed in ", points.num_outer_lists(), " points sets."); + "Cannot create a grid with more than ", MAX_GRIDS_PER_BATCH, + " grids in a batch. ", "You passed in ", points.num_outer_lists(), + " points sets."); const nanovdb::Coord padMin = pad_min.value(); const nanovdb::Coord padMax = pad_max.value(); const int64_t numGrids = points.joffsets().size(0) - 1; - TORCH_CHECK(numGrids == points.num_outer_lists(), "If this happens, Francis' paranoia about grids and points was justified. File a bug"); + TORCH_CHECK( + numGrids == points.num_outer_lists(), + "If this happens, Francis' paranoia about grids and points was justified. File a bug"); - const std::vector voxSizesVec = voxel_sizes.value(numGrids, true /* onlyPositive */, "voxel_sizes"); - const std::vector voxOriginsVec = origins.value(numGrids, false /* onlyPositive */, "voxel_origins"); + const std::vector voxSizesVec = + voxel_sizes.value(numGrids, true /* onlyPositive */, "voxel_sizes"); + const std::vector voxOriginsVec = + origins.value(numGrids, false /* onlyPositive */, "voxel_origins"); std::vector transforms; transforms.reserve(numGrids); for (int64_t i = 0; i < numGrids; i += 1) { - transforms.push_back(detail::primalVoxelTransformForSizeAndOrigin(voxSizesVec[i], voxOriginsVec[i])); + transforms.push_back( + detail::primalVoxelTransformForSizeAndOrigin(voxSizesVec[i], voxOriginsVec[i])); } mImpl = c10::make_intrusive( - detail::build::buildPaddedGridFromPoints(is_mutable(), points, transforms, padMin, padMax), - voxSizesVec, voxOriginsVec); -} - - -void GridBatch::set_from_nearest_voxels_to_points(const JaggedTensor& points, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins) { - TORCH_CHECK_VALUE(points.ldim() == 1, - "Expected points to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", points.ldim(), "list dimensions" - ); + detail::build::buildPaddedGridFromPoints(is_mutable(), points, transforms, padMin, padMax), + voxSizesVec, voxOriginsVec); +} + +void +GridBatch::set_from_nearest_voxels_to_points(const JaggedTensor &points, + const Vec3dBatchOrScalar &voxel_sizes, + const Vec3dBatch &origins) { + TORCH_CHECK_VALUE( + points.ldim() == 1, + "Expected points to have 1 list dimension, i.e. be a single list of coordinate values, but got", + points.ldim(), "list dimensions"); TORCH_CHECK_TYPE(points.is_floating_point(), "points must have a floating point type"); - TORCH_CHECK_VALUE(points.rdim() == 2, std::string("Expected points to have 2 dimensions (shape (n, 3)) but got ") + - std::to_string(points.rdim()) + " dimensions"); + TORCH_CHECK_VALUE(points.rdim() == 2, + std::string("Expected points to have 2 dimensions (shape (n, 3)) but got ") + + std::to_string(points.rdim()) + " dimensions"); TORCH_CHECK_VALUE(points.rsize(1) == 3, "Expected 3 dimensional points but got points.shape[1] = " + - std::to_string(points.rsize(1))); + std::to_string(points.rsize(1))); impl()->checkDevice(points); - TORCH_CHECK(points.num_tensors() == points.num_outer_lists(), "If this happens, Francis' paranoia was justified. File a bug"); + TORCH_CHECK(points.num_tensors() == points.num_outer_lists(), + "If this happens, Francis' paranoia was justified. File a bug"); TORCH_CHECK_VALUE(points.num_outer_lists() <= MAX_GRIDS_PER_BATCH, - "Cannot create a grid with more than ", MAX_GRIDS_PER_BATCH, " grids in a batch. ", - "You passed in ", points.num_outer_lists(), " point sets."); + "Cannot create a grid with more than ", MAX_GRIDS_PER_BATCH, + " grids in a batch. ", "You passed in ", points.num_outer_lists(), + " point sets."); const int64_t numGrids = points.joffsets().size(0) - 1; - TORCH_CHECK(numGrids == points.num_outer_lists(), "If this happens, Francis' paranoia was justified. File a bug"); + TORCH_CHECK(numGrids == points.num_outer_lists(), + "If this happens, Francis' paranoia was justified. File a bug"); - const std::vector voxSizesVec = voxel_sizes.value(numGrids, true /* onlyPositive */, "voxel_sizes"); - const std::vector voxOriginsVec = origins.value(numGrids, false /* onlyPositive */, "voxel_origins"); + const std::vector voxSizesVec = + voxel_sizes.value(numGrids, true /* onlyPositive */, "voxel_sizes"); + const std::vector voxOriginsVec = + origins.value(numGrids, false /* onlyPositive */, "voxel_origins"); std::vector transforms; transforms.reserve(numGrids); for (int64_t i = 0; i < numGrids; i += 1) { - transforms.push_back(detail::primalVoxelTransformForSizeAndOrigin(voxSizesVec[i], voxOriginsVec[i])); + transforms.push_back( + detail::primalVoxelTransformForSizeAndOrigin(voxSizesVec[i], voxOriginsVec[i])); } mImpl = c10::make_intrusive( - detail::build::buildNearestNeighborGridFromPoints(is_mutable(), points, transforms), - voxSizesVec, voxOriginsVec); -} - - -void GridBatch::set_from_ijk(const JaggedTensor& coords, - const Vec3i& pad_min, - const Vec3i& pad_max, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins) { - TORCH_CHECK_VALUE(coords.ldim() == 1, - "Expected coords to have 1 list dimension, i.e. 
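
set_from_points builds a grid around the voxels touched by each point, padded by pad_min/pad_max, while set_from_nearest_voxels_to_points keeps only the voxels nearest to each point. A sketch contrasting the two follows, with the same caveat as above that the scalar-to-Vec3 conversions are assumed; Vec3i() is used for default padding exactly as clipped_grid does later in this file, and the helper name is mine.

#include <torch/torch.h>
#include "GridBatch.h"
#include "JaggedTensor.h"

// Sketch: two ways to build grids around a point cloud pts (float, [N, 3]).
void gridsFromPoints(const torch::Tensor &pts) {
    fvdb::GridBatch padded(pts.device(), /*isMutable=*/false);
    padded.set_from_points(fvdb::JaggedTensor(pts),
                           /*pad_min=*/fvdb::Vec3i(), /*pad_max=*/fvdb::Vec3i(),
                           /*voxel_sizes=*/0.1, /*origins=*/0.0);

    fvdb::GridBatch nearest(pts.device(), /*isMutable=*/false);
    nearest.set_from_nearest_voxels_to_points(fvdb::JaggedTensor(pts),
                                              /*voxel_sizes=*/0.1, /*origins=*/0.0);
}
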
be a single list of coordinate values, but got", coords.ldim(), "list dimensions" - ); - TORCH_CHECK_TYPE(at::isIntegralType(coords.scalar_type(), false), "coords must have an integer type"); - TORCH_CHECK_VALUE(coords.rdim() == 2, std::string("Expected points to have 2 dimensions (shape (n, 3)) but got ") + - std::to_string(coords.rdim()) + " dimensions"); + detail::build::buildNearestNeighborGridFromPoints(is_mutable(), points, transforms), + voxSizesVec, voxOriginsVec); +} + +void +GridBatch::set_from_ijk(const JaggedTensor &coords, const Vec3i &pad_min, const Vec3i &pad_max, + const Vec3dBatchOrScalar &voxel_sizes, const Vec3dBatch &origins) { + TORCH_CHECK_VALUE( + coords.ldim() == 1, + "Expected coords to have 1 list dimension, i.e. be a single list of coordinate values, but got", + coords.ldim(), "list dimensions"); + TORCH_CHECK_TYPE(at::isIntegralType(coords.scalar_type(), false), + "coords must have an integer type"); + TORCH_CHECK_VALUE(coords.rdim() == 2, + std::string("Expected points to have 2 dimensions (shape (n, 3)) but got ") + + std::to_string(coords.rdim()) + " dimensions"); TORCH_CHECK_VALUE(coords.rsize(1) == 3, "Expected 3 dimensional coords but got points.rshape[1] = " + - std::to_string(coords.rsize(1))); + std::to_string(coords.rsize(1))); impl()->checkDevice(coords); - TORCH_CHECK(coords.num_tensors() == coords.num_outer_lists(), "If this happens, Francis' paranoia was justified. File a bug"); + TORCH_CHECK(coords.num_tensors() == coords.num_outer_lists(), + "If this happens, Francis' paranoia was justified. File a bug"); TORCH_CHECK_VALUE(coords.num_outer_lists() <= MAX_GRIDS_PER_BATCH, - "Cannot create a grid with more than ", MAX_GRIDS_PER_BATCH, " grids in a batch. ", - "You passed in ", coords.num_outer_lists(), " coordinate sets."); + "Cannot create a grid with more than ", MAX_GRIDS_PER_BATCH, + " grids in a batch. ", "You passed in ", coords.num_outer_lists(), + " coordinate sets."); - const nanovdb::Coord& padMin = pad_min.value(); - const nanovdb::Coord& padMax = pad_max.value(); + const nanovdb::Coord &padMin = pad_min.value(); + const nanovdb::Coord &padMax = pad_max.value(); const int64_t numGrids = coords.joffsets().size(0) - 1; - TORCH_CHECK(numGrids == coords.num_outer_lists(), "If this happens, Francis' paranoia was justified. File a bug"); + TORCH_CHECK(numGrids == coords.num_outer_lists(), + "If this happens, Francis' paranoia was justified. 
File a bug"); - const std::vector voxSizesVec = voxel_sizes.value(numGrids, true /* onlyPositive */, "voxel_sizes"); - const std::vector voxOriginsVec = origins.value(numGrids, false /* onlyPositive */, "voxel_origins"); + const std::vector voxSizesVec = + voxel_sizes.value(numGrids, true /* onlyPositive */, "voxel_sizes"); + const std::vector voxOriginsVec = + origins.value(numGrids, false /* onlyPositive */, "voxel_origins"); mImpl = c10::make_intrusive( - detail::build::buildPaddedGridFromCoords(is_mutable(), coords, padMin, padMax), - voxSizesVec, voxOriginsVec); + detail::build::buildPaddedGridFromCoords(is_mutable(), coords, padMin, padMax), voxSizesVec, + voxOriginsVec); } - -void GridBatch::set_from_dense_grid(const int64_t num_grids, - const Vec3i& dense_dims, - const Vec3i& ijk_min, - const Vec3dBatchOrScalar& voxel_sizes, - const Vec3dBatch& origins, - torch::optional mask) { - +void +GridBatch::set_from_dense_grid(const int64_t num_grids, const Vec3i &dense_dims, + const Vec3i &ijk_min, const Vec3dBatchOrScalar &voxel_sizes, + const Vec3dBatch &origins, torch::optional mask) { TORCH_CHECK_VALUE(num_grids >= 0, "num_grids must be non-negative"); - const nanovdb::Coord& size = dense_dims.value(); + const nanovdb::Coord &size = dense_dims.value(); - const nanovdb::Coord& ijk_min_value = ijk_min.value(); + const nanovdb::Coord &ijk_min_value = ijk_min.value(); if (mask.has_value()) { impl()->checkDevice(mask.value()); - TORCH_CHECK_VALUE(mask.value().dtype() == torch::kBool, "mask must be a boolean type or None"); + TORCH_CHECK_VALUE(mask.value().dtype() == torch::kBool, + "mask must be a boolean type or None"); TORCH_CHECK_VALUE(mask.value().dim() == 3, "mask must be 3 dimensional"); TORCH_CHECK_VALUE(mask.value().size(0) == size[0], "mask must have shape (w, h, d) = size"); TORCH_CHECK_VALUE(mask.value().size(1) == size[1], "mask must have shape (w, h, d) = size"); @@ -608,23 +669,26 @@ void GridBatch::set_from_dense_grid(const int64_t num_grids, TORCH_CHECK_VALUE(size[0] >= 0 && size[1] >= 0 && size[2] >= 0, "size must be non-negative"); - std::vector voxSizesVec = voxel_sizes.value(num_grids, true /* onlyPositive */, "voxel_sizes"); - std::vector voxOriginsVec = origins.value(num_grids, false /* onlyPositive */, "voxel_origins"); + std::vector voxSizesVec = + voxel_sizes.value(num_grids, true /* onlyPositive */, "voxel_sizes"); + std::vector voxOriginsVec = + origins.value(num_grids, false /* onlyPositive */, "voxel_origins"); - TORCH_CHECK_VALUE(num_grids <= MAX_GRIDS_PER_BATCH, - "Cannot create a grid with more than ", MAX_GRIDS_PER_BATCH, " grids in a batch. ", - "You requested ", num_grids, " grids."); - TORCH_CHECK((size_t) num_grids == voxSizesVec.size(), "If this happens, Francis' paranoia was justified. File a bug"); - TORCH_CHECK((size_t) num_grids == voxOriginsVec.size(), "If this happens, Francis' paranoia was justified. File a bug"); + TORCH_CHECK_VALUE(num_grids <= MAX_GRIDS_PER_BATCH, "Cannot create a grid with more than ", + MAX_GRIDS_PER_BATCH, " grids in a batch. ", "You requested ", num_grids, + " grids."); + TORCH_CHECK((size_t)num_grids == voxSizesVec.size(), + "If this happens, Francis' paranoia was justified. File a bug"); + TORCH_CHECK((size_t)num_grids == voxOriginsVec.size(), + "If this happens, Francis' paranoia was justified. 
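
set_from_ijk builds topology directly from integer voxel coordinates. A small sketch follows; the int32 coordinate tensor satisfies the integer-type check above, and the scalar voxel_sizes / origins conversions are again an assumption about Types.h rather than something this hunk shows.

#include <torch/torch.h>
#include "GridBatch.h"
#include "JaggedTensor.h"

// Sketch: a three-voxel grid on the CPU from explicit ijk coordinates.
fvdb::GridBatch gridFromIjk() {
    torch::Tensor ijk = torch::tensor({{0, 0, 0}, {1, 0, 0}, {0, 2, 3}}, torch::kInt32);
    fvdb::GridBatch grid(torch::kCPU, /*isMutable=*/false);
    grid.set_from_ijk(fvdb::JaggedTensor(ijk),
                      /*pad_min=*/fvdb::Vec3i(), /*pad_max=*/fvdb::Vec3i(),
                      /*voxel_sizes=*/1.0, /*origins=*/0.0);
    // With default (zero) padding, grid.total_voxels() should report 3.
    return grid;
}
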
File a bug"); mImpl = c10::make_intrusive( - detail::build::buildDenseGrid(device(), is_mutable(), num_grids, size, ijk_min_value, mask), - voxSizesVec, voxOriginsVec); - + detail::build::buildDenseGrid(device(), is_mutable(), num_grids, size, ijk_min_value, mask), + voxSizesVec, voxOriginsVec); } - -GridBatch GridBatch::dual_grid(bool exclude_border) const { +GridBatch +GridBatch::dual_grid(bool exclude_border) const { GridBatch ret = GridBatch(device(), is_mutable()); if (grid_count() == 0) { return ret; @@ -633,14 +697,15 @@ GridBatch GridBatch::dual_grid(bool exclude_border) const { return ret; } - -GridBatch GridBatch::coarsened_grid(Vec3iOrScalar branch_factor) const { +GridBatch +GridBatch::coarsened_grid(Vec3iOrScalar branch_factor) const { nanovdb::Coord branchFactorCoord = branch_factor.value(); for (int i = 0; i < 3; i += 1) { - TORCH_CHECK_VALUE(branchFactorCoord[i] > 0, "branch_factor must be strictly positive. Got [" + - std::to_string(branchFactorCoord[0]) + ", " + - std::to_string(branchFactorCoord[1]) + ", " + - std::to_string(branchFactorCoord[2]) + "]"); + TORCH_CHECK_VALUE(branchFactorCoord[i] > 0, + "branch_factor must be strictly positive. Got [" + + std::to_string(branchFactorCoord[0]) + ", " + + std::to_string(branchFactorCoord[1]) + ", " + + std::to_string(branchFactorCoord[2]) + "]"); } GridBatch ret(device(), is_mutable()); if (grid_count() == 0) { @@ -650,20 +715,22 @@ GridBatch GridBatch::coarsened_grid(Vec3iOrScalar branch_factor) const { return ret; } - -GridBatch GridBatch::subdivided_grid(Vec3iOrScalar subdiv_factor, const torch::optional mask) const { - +GridBatch +GridBatch::subdivided_grid(Vec3iOrScalar subdiv_factor, + const torch::optional mask) const { if (mask.has_value()) { - TORCH_CHECK_VALUE(mask.value().ldim() == 1, - "Expected mask to have 1 list dimension, i.e. be a single list of coordinate values, but got", mask.value().ldim(), "list dimensions" - ); + TORCH_CHECK_VALUE( + mask.value().ldim() == 1, + "Expected mask to have 1 list dimension, i.e. be a single list of coordinate values, but got", + mask.value().ldim(), "list dimensions"); } const nanovdb::Coord subdivFactorCoord = subdiv_factor.value(); for (int i = 0; i < 3; i += 1) { - TORCH_CHECK_VALUE(subdivFactorCoord[i] > 0, "subdiv_factor must be strictly positive. Got [" + - std::to_string(subdivFactorCoord[0]) + ", " + - std::to_string(subdivFactorCoord[1]) + ", " + - std::to_string(subdivFactorCoord[2]) + "]"); + TORCH_CHECK_VALUE(subdivFactorCoord[i] > 0, + "subdiv_factor must be strictly positive. 
Got [" + + std::to_string(subdivFactorCoord[0]) + ", " + + std::to_string(subdivFactorCoord[1]) + ", " + + std::to_string(subdivFactorCoord[2]) + "]"); } GridBatch ret = GridBatch(device(), is_mutable()); @@ -674,11 +741,11 @@ GridBatch GridBatch::subdivided_grid(Vec3iOrScalar subdiv_factor, const torch::o return ret; } -GridBatch GridBatch::clipped_grid(const Vec3iBatch& ijk_min, - const Vec3iBatch& ijk_max) const { - +GridBatch +GridBatch::clipped_grid(const Vec3iBatch &ijk_min, const Vec3iBatch &ijk_max) const { JaggedTensor activeVoxelMask = FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return fvdb::detail::ops::dispatchActiveVoxelsInBoundsMask(*impl(), ijk_min, ijk_max, false); + return fvdb::detail::ops::dispatchActiveVoxelsInBoundsMask(*impl(), ijk_min, + ijk_max, false); }); JaggedTensor activeVoxelCoords = FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { @@ -690,26 +757,29 @@ GridBatch GridBatch::clipped_grid(const Vec3iBatch& ijk_min, // construct grid from ijk's clipped from original grid GridBatch clippedGrid = sparse_grid_from_ijk(activeVoxelMaskCoords, Vec3i(), Vec3i(), - voxel_sizes(), origins(), is_mutable()); + voxel_sizes(), origins(), is_mutable()); return clippedGrid; } -std::pair GridBatch::clip(const JaggedTensor& features, - const Vec3iBatch& ijk_min, - const Vec3iBatch& ijk_max) const { - - TORCH_CHECK_VALUE(features.ldim() == 1, - "Expected features to have 1 list dimension, i.e. be a single list of coordinate values, but got", features.ldim(), "list dimensions" - ); +std::pair +GridBatch::clip(const JaggedTensor &features, const Vec3iBatch &ijk_min, + const Vec3iBatch &ijk_max) const { + TORCH_CHECK_VALUE( + features.ldim() == 1, + "Expected features to have 1 list dimension, i.e. be a single list of coordinate values, but got", + features.ldim(), "list dimensions"); impl()->checkDevice(features); TORCH_CHECK(features.rsize(0) == total_voxels(), "Value count of features does not match grid"); - TORCH_CHECK(features.num_outer_lists() == grid_count(), "Batch size of features does not match grid."); - TORCH_CHECK(torch::equal(features.joffsets(), impl()->voxelOffsets(false)), "Offsets of features does not match grid."); + TORCH_CHECK(features.num_outer_lists() == grid_count(), + "Batch size of features does not match grid."); + TORCH_CHECK(torch::equal(features.joffsets(), impl()->voxelOffsets(false)), + "Offsets of features does not match grid."); JaggedTensor activeVoxelMask = FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return fvdb::detail::ops::dispatchActiveVoxelsInBoundsMask(*impl(), ijk_min, ijk_max, false); + return fvdb::detail::ops::dispatchActiveVoxelsInBoundsMask(*impl(), ijk_min, + ijk_max, false); }); JaggedTensor activeVoxelCoords = FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { @@ -728,10 +798,12 @@ std::pair GridBatch::clip(const JaggedTensor& features, return std::make_pair(clippedFeatures, clippedGrid); } -std::vector GridBatch::marching_cubes(const JaggedTensor& field, double level) const { - TORCH_CHECK_VALUE(field.ldim() == 1, - "Expected field to have 1 list dimension, i.e. be a single list of coordinate values, but got", field.ldim(), "list dimensions" - ); +std::vector +GridBatch::marching_cubes(const JaggedTensor &field, double level) const { + TORCH_CHECK_VALUE( + field.ldim() == 1, + "Expected field to have 1 list dimension, i.e. 
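
dual_grid, coarsened_grid and subdivided_grid each return a new GridBatch and leave the source batch untouched, as the implementations above show. A sketch follows, assuming Vec3iOrScalar converts from an int the same way Vec3iOrScalar(0) is used in conv_grid below; the helper name is illustrative.

#include <torch/torch.h>
#include "GridBatch.h"

// Sketch: derive new topology from an existing batch.
void derivedGrids(const fvdb::GridBatch &grid) {
    fvdb::GridBatch coarse = grid.coarsened_grid(/*branch_factor=*/2);      // every 2x2x2 block -> 1 voxel
    fvdb::GridBatch fine   = grid.subdivided_grid(/*subdiv_factor=*/2,
                                                  /*mask=*/torch::nullopt); // every voxel -> 2x2x2 block
    fvdb::GridBatch dual   = grid.dual_grid(/*exclude_border=*/false);      // voxels at primal corners
}
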
be a single list of coordinate values, but got", + field.ldim(), "list dimensions"); TORCH_CHECK_TYPE(field.is_floating_point(), "field must have a floating point type"); TORCH_CHECK_VALUE(field.numel() == total_voxels(), "Value count not match!"); TORCH_CHECK_VALUE(field.num_outer_lists() == grid_count(), "Batch size not match!"); @@ -742,30 +814,37 @@ std::vector GridBatch::marching_cubes(const JaggedTensor& field, d if (fieldJdata.dim() != 1) { fieldJdata = fieldJdata.squeeze(); } - TORCH_CHECK(fieldJdata.dim() == 1, std::string("Expected field to have 1 effective dimension but got ") + - std::to_string(field.rdim()) + " dimensions"); + TORCH_CHECK(fieldJdata.dim() == 1, + std::string("Expected field to have 1 effective dimension but got ") + + std::to_string(field.rdim()) + " dimensions"); impl()->checkDevice(field); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { return fvdb::detail::ops::dispatchMarchingCubes(*impl(), fieldJdata, level); }); } -JaggedTensor GridBatch::sparse_conv_halo(const JaggedTensor& input, const torch::Tensor& weight, int variant) const { - TORCH_CHECK_VALUE(input.ldim() == 1, - "Expected input to have 1 list dimension, i.e. be a single list of coordinate values, but got", input.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::sparse_conv_halo(const JaggedTensor &input, const torch::Tensor &weight, + int variant) const { + TORCH_CHECK_VALUE( + input.ldim() == 1, + "Expected input to have 1 list dimension, i.e. be a single list of coordinate values, but got", + input.ldim(), "list dimensions"); TORCH_CHECK_TYPE(input.is_floating_point(), "input must have a floating point type"); TORCH_CHECK_VALUE(input.rsize(0) == total_voxels(), "Value count not match!"); TORCH_CHECK_VALUE(input.num_outer_lists() == grid_count(), "Batch size not match!"); impl()->checkDevice(input); - torch::Tensor ret = detail::autograd::SparseConvolutionHalo::apply(impl(), input.jdata(), weight, variant)[0]; + torch::Tensor ret = + detail::autograd::SparseConvolutionHalo::apply(impl(), input.jdata(), weight, variant)[0]; return input.jagged_like(ret); } - -GridBatch GridBatch::conv_grid(Vec3iOrScalar kernel_size, Vec3iOrScalar stride) const { - TORCH_CHECK_VALUE(Vec3iOrScalar(0).value() < kernel_size.value(), "kernel_size must be strictly positive. Got " + kernel_size.toString()); - TORCH_CHECK_VALUE(Vec3iOrScalar(0).value() < stride.value(), "stride must be strictly positive. Got " + stride.toString()); +GridBatch +GridBatch::conv_grid(Vec3iOrScalar kernel_size, Vec3iOrScalar stride) const { + TORCH_CHECK_VALUE(Vec3iOrScalar(0).value() < kernel_size.value(), + "kernel_size must be strictly positive. Got " + kernel_size.toString()); + TORCH_CHECK_VALUE(Vec3iOrScalar(0).value() < stride.value(), + "stride must be strictly positive. 
Got " + stride.toString()); GridBatch ret = GridBatch(device(), is_mutable()); if (grid_count() == 0) { return ret; @@ -773,28 +852,36 @@ GridBatch GridBatch::conv_grid(Vec3iOrScalar kernel_size, Vec3iOrScalar stride) std::vector voxS, voxO; impl()->gridVoxelSizesAndOrigins(voxS, voxO); ret.mImpl = c10::make_intrusive( - detail::build::buildConvGridFromGrid(ret.is_mutable(), *impl(), kernel_size.value(), stride.value()), voxS, voxO); - ret.impl()->setCoarseTransformFromFineGrid(*impl(), nanovdb::Coord(stride.value().x(), stride.value().y(), stride.value().z())); + detail::build::buildConvGridFromGrid(ret.is_mutable(), *impl(), kernel_size.value(), + stride.value()), + voxS, voxO); + ret.impl()->setCoarseTransformFromFineGrid( + *impl(), nanovdb::Coord(stride.value().x(), stride.value().y(), stride.value().z())); return ret; } -void GridBatch::buildCoarseFromFineGrid(const GridBatch& fineGrid, nanovdb::Coord branchFactor) { +void +GridBatch::buildCoarseFromFineGrid(const GridBatch &fineGrid, nanovdb::Coord branchFactor) { std::vector voxS, voxO; fineGrid.impl()->gridVoxelSizesAndOrigins(voxS, voxO); mImpl = c10::make_intrusive( - detail::build::buildCoarseGridFromFineGrid(is_mutable(), *fineGrid.impl(), branchFactor), - voxS, voxO); + detail::build::buildCoarseGridFromFineGrid(is_mutable(), *fineGrid.impl(), branchFactor), + voxS, voxO); impl()->setCoarseTransformFromFineGrid(*fineGrid.impl(), branchFactor); } - -void GridBatch::buildFineFromCoarseGrid(const GridBatch& coarseGrid, const torch::optional& subdivMask, nanovdb::Coord subdivFactor) { +void +GridBatch::buildFineFromCoarseGrid(const GridBatch &coarseGrid, + const torch::optional &subdivMask, + nanovdb::Coord subdivFactor) { if (subdivMask.has_value()) { - TORCH_CHECK_VALUE(subdivMask.value().ldim() == 1, - "Expected subdiv_mask to have 1 list dimension, i.e. be a single list of coordinate values, but got", subdivMask.value().ldim(), "list dimensions" - ); + TORCH_CHECK_VALUE( + subdivMask.value().ldim() == 1, + "Expected subdiv_mask to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", + subdivMask.value().ldim(), "list dimensions"); impl()->checkDevice(subdivMask.value()); - TORCH_CHECK(subdivMask.value().jdata().sizes().size() == 1, "subdivision mask must have 1 dimension"); + TORCH_CHECK(subdivMask.value().jdata().sizes().size() == 1, + "subdivision mask must have 1 dimension"); TORCH_CHECK(subdivMask.value().jdata().size(0) == coarseGrid.total_voxels(), "subdivision mask must be either empty tensor or have one entry per voxel"); TORCH_CHECK(subdivMask.value().scalar_type() == torch::kBool, @@ -804,269 +891,295 @@ void GridBatch::buildFineFromCoarseGrid(const GridBatch& coarseGrid, const torch std::vector voxS, voxO; coarseGrid.impl()->gridVoxelSizesAndOrigins(voxS, voxO); mImpl = c10::make_intrusive( - detail::build::buildFineGridFromCoarseGrid(is_mutable(), *coarseGrid.impl(), subdivMask, subdivFactor), - voxS, voxO); + detail::build::buildFineGridFromCoarseGrid(is_mutable(), *coarseGrid.impl(), subdivMask, + subdivFactor), + voxS, voxO); impl()->setFineTransformFromCoarseGrid(*coarseGrid.impl(), subdivFactor); } - -void GridBatch::buildDualFromPrimalGrid(const GridBatch& primalGrid, bool excludeBorder) { +void +GridBatch::buildDualFromPrimalGrid(const GridBatch &primalGrid, bool excludeBorder) { std::vector voxS, voxO; primalGrid.impl()->gridVoxelSizesAndOrigins(voxS, voxO); mImpl = c10::make_intrusive( - detail::build::buildPaddedGridFromGrid(is_mutable(), *primalGrid.impl(), 0, 1, excludeBorder), + detail::build::buildPaddedGridFromGrid(is_mutable(), *primalGrid.impl(), 0, 1, + excludeBorder), voxS, voxO); impl()->setPrimalTransformFromDualGrid(*primalGrid.impl()); } - -std::vector GridBatch::voxels_along_rays(const JaggedTensor& ray_origins, - const JaggedTensor& ray_directions, - int64_t max_vox, double eps, - bool return_ijk, bool cumulative) const { - TORCH_CHECK_VALUE(ray_origins.ldim() == 1, - "Expected ray_origins to have 1 list dimension, i.e. be a single list of coordinate values, but got", ray_origins.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(ray_directions.ldim() == 1, - "Expected ray_directions to have 1 list dimension, i.e. be a single list of coordinate values, but got", ray_directions.ldim(), "list dimensions" - ); +std::vector +GridBatch::voxels_along_rays(const JaggedTensor &ray_origins, const JaggedTensor &ray_directions, + int64_t max_vox, double eps, bool return_ijk, bool cumulative) const { + TORCH_CHECK_VALUE( + ray_origins.ldim() == 1, + "Expected ray_origins to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ray_origins.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + ray_directions.ldim() == 1, + "Expected ray_directions to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ray_directions.ldim(), "list dimensions"); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return fvdb::detail::ops::dispatchVoxelsAlongRays(*impl(), ray_origins, ray_directions, max_vox, eps, return_ijk, cumulative); + return fvdb::detail::ops::dispatchVoxelsAlongRays( + *impl(), ray_origins, ray_directions, max_vox, eps, return_ijk, cumulative); }); } - -JaggedTensor GridBatch::segments_along_rays(const JaggedTensor& ray_origins, - const JaggedTensor& ray_directions, - int64_t max_segments, double eps, bool ignore_masked) const { - TORCH_CHECK_VALUE(ray_origins.ldim() == 1, - "Expected ray_origins to have 1 list dimension, i.e. 
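
voxels_along_rays marches each ray through the grid and returns the voxels it crosses. A sketch of a call follows; my reading of the returned JaggedTensors (hit voxels, then entry/exit distances per voxel) is an assumption based on the op's name and arguments, so check VoxelsAlongRays.cu before relying on it. The function name is mine.

#include <vector>
#include <torch/torch.h>
#include "GridBatch.h"
#include "JaggedTensor.h"

// Sketch: enumerate up to 128 voxels along each ray.
// rays_o / rays_d are float tensors of shape [R, 3] on the grid's device.
std::vector<fvdb::JaggedTensor> marchRays(const fvdb::GridBatch &grid,
                                          const torch::Tensor &rays_o,
                                          const torch::Tensor &rays_d) {
    return grid.voxels_along_rays(fvdb::JaggedTensor(rays_o), fvdb::JaggedTensor(rays_d),
                                  /*max_vox=*/128, /*eps=*/0.0,
                                  /*return_ijk=*/true, /*cumulative=*/false);
}
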
be a single list of coordinate values, but got", ray_origins.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(ray_directions.ldim() == 1, - "Expected ray_directions to have 1 list dimension, i.e. be a single list of coordinate values, but got", ray_directions.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::segments_along_rays(const JaggedTensor &ray_origins, const JaggedTensor &ray_directions, + int64_t max_segments, double eps, bool ignore_masked) const { + TORCH_CHECK_VALUE( + ray_origins.ldim() == 1, + "Expected ray_origins to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ray_origins.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + ray_directions.ldim() == 1, + "Expected ray_directions to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ray_directions.ldim(), "list dimensions"); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return fvdb::detail::ops::dispatchSegmentsAlongRays(*impl(), ray_origins, ray_directions, max_segments, eps, ignore_masked); + return fvdb::detail::ops::dispatchSegmentsAlongRays( + *impl(), ray_origins, ray_directions, max_segments, eps, ignore_masked); }); } - -JaggedTensor GridBatch::ray_implicit_intersection(const JaggedTensor& ray_origins, - const JaggedTensor& ray_directions, - const JaggedTensor& gridScalars, - double eps) const { - TORCH_CHECK_VALUE(ray_origins.ldim() == 1, - "Expected ray_origins to have 1 list dimension, i.e. be a single list of coordinate values, but got", ray_origins.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(ray_directions.ldim() == 1, - "Expected ray_directions to have 1 list dimension, i.e. be a single list of coordinate values, but got", ray_directions.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(gridScalars.ldim() == 1, - "Expected grid_scalars to have 1 list dimension, i.e. be a single list of coordinate values, but got", gridScalars.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::ray_implicit_intersection(const JaggedTensor &ray_origins, + const JaggedTensor &ray_directions, + const JaggedTensor &gridScalars, double eps) const { + TORCH_CHECK_VALUE( + ray_origins.ldim() == 1, + "Expected ray_origins to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ray_origins.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + ray_directions.ldim() == 1, + "Expected ray_directions to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ray_directions.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + gridScalars.ldim() == 1, + "Expected grid_scalars to have 1 list dimension, i.e. be a single list of coordinate values, but got", + gridScalars.ldim(), "list dimensions"); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return fvdb::detail::ops::dispatchRayImplicitIntersection(*impl(), ray_origins, ray_directions, gridScalars, eps); + return fvdb::detail::ops::dispatchRayImplicitIntersection( + *impl(), ray_origins, ray_directions, gridScalars, eps); }); } - -JaggedTensor GridBatch::uniform_ray_samples(const JaggedTensor& ray_origins, - const JaggedTensor& ray_directions, - const JaggedTensor& t_min, - const JaggedTensor& t_max, - double step_size, - double cone_angle, - bool include_end_segments, - bool return_midpoint, - double eps) const { - TORCH_CHECK_VALUE(ray_origins.ldim() == 1, - "Expected ray_origins to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", ray_origins.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(ray_directions.ldim() == 1, - "Expected ray_directions to have 1 list dimension, i.e. be a single list of coordinate values, but got", ray_directions.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(t_min.ldim() == 1, - "Expected t_min to have 1 list dimension, i.e. be a single list of coordinate values, but got", t_min.ldim(), "list dimensions" - ); - TORCH_CHECK_VALUE(t_max.ldim() == 1, - "Expected t_max to have 1 list dimension, i.e. be a single list of coordinate values, but got", t_max.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::uniform_ray_samples(const JaggedTensor &ray_origins, const JaggedTensor &ray_directions, + const JaggedTensor &t_min, const JaggedTensor &t_max, + double step_size, double cone_angle, bool include_end_segments, + bool return_midpoint, double eps) const { + TORCH_CHECK_VALUE( + ray_origins.ldim() == 1, + "Expected ray_origins to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ray_origins.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + ray_directions.ldim() == 1, + "Expected ray_directions to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ray_directions.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + t_min.ldim() == 1, + "Expected t_min to have 1 list dimension, i.e. be a single list of coordinate values, but got", + t_min.ldim(), "list dimensions"); + TORCH_CHECK_VALUE( + t_max.ldim() == 1, + "Expected t_max to have 1 list dimension, i.e. be a single list of coordinate values, but got", + t_max.ldim(), "list dimensions"); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return fvdb::detail::ops::dispatchUniformRaySamples(*impl(), ray_origins, ray_directions, t_min, t_max, step_size, cone_angle, include_end_segments, return_midpoint, eps); + return fvdb::detail::ops::dispatchUniformRaySamples( + *impl(), ray_origins, ray_directions, t_min, t_max, step_size, cone_angle, + include_end_segments, return_midpoint, eps); }); } - -JaggedTensor GridBatch::neighbor_indexes(const JaggedTensor& ijk, int32_t extent, int32_t bitshift) const { - TORCH_CHECK_VALUE(ijk.ldim() == 1, - "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", ijk.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::neighbor_indexes(const JaggedTensor &ijk, int32_t extent, int32_t bitshift) const { + TORCH_CHECK_VALUE( + ijk.ldim() == 1, + "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ijk.ldim(), "list dimensions"); TORCH_CHECK_VALUE(extent >= 0, "extent must be >= 0"); nanovdb::Coord extentMin(-extent, -extent, -extent); nanovdb::Coord extentMax(extent, extent, extent); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return fvdb::detail::ops::dispatchVoxelNeighborhood(*impl(), ijk, extentMin, extentMax, bitshift); + return fvdb::detail::ops::dispatchVoxelNeighborhood(*impl(), ijk, extentMin, + extentMax, bitshift); }); } - -JaggedTensor GridBatch::points_in_active_voxel(const JaggedTensor& xyz, bool ignore_disabled) const { - TORCH_CHECK_VALUE(xyz.ldim() == 1, - "Expected xyz to have 1 list dimension, i.e. be a single list of coordinate values, but got", xyz.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::points_in_active_voxel(const JaggedTensor &xyz, bool ignore_disabled) const { + TORCH_CHECK_VALUE( + xyz.ldim() == 1, + "Expected xyz to have 1 list dimension, i.e. 
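
uniform_ray_samples draws step_size-spaced samples along each ray while skipping unoccupied space, the usual pattern for volume-rendering samplers. A sketch follows, with t_min / t_max given as one scalar per ray; the layout of the returned JaggedTensor is not shown in this hunk, so treat it as opaque here. The helper name and the 0.01 step are illustrative.

#include <torch/torch.h>
#include "GridBatch.h"
#include "JaggedTensor.h"

// Sketch: uniform samples along rays, restricted to occupied voxels.
// rays_o / rays_d: float [R, 3]; t0 / t1: float [R] near/far distances.
fvdb::JaggedTensor sampleRays(const fvdb::GridBatch &grid,
                              const torch::Tensor &rays_o, const torch::Tensor &rays_d,
                              const torch::Tensor &t0, const torch::Tensor &t1) {
    return grid.uniform_ray_samples(fvdb::JaggedTensor(rays_o), fvdb::JaggedTensor(rays_d),
                                    fvdb::JaggedTensor(t0), fvdb::JaggedTensor(t1),
                                    /*step_size=*/0.01, /*cone_angle=*/0.0,
                                    /*include_end_segments=*/true,
                                    /*return_midpoint=*/false, /*eps=*/0.0);
}
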
be a single list of coordinate values, but got", + xyz.ldim(), "list dimensions"); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { return fvdb::detail::ops::dispatchPointsInGrid(*impl(), xyz, ignore_disabled); }); } - -JaggedTensor GridBatch::cubes_intersect_grid(const JaggedTensor& cube_centers, - const Vec3dOrScalar& cube_min, - const Vec3dOrScalar& cube_max, - bool ignore_disabled) const { - TORCH_CHECK_VALUE(cube_centers.ldim() == 1, - "Expected cube_centers to have 1 list dimension, i.e. be a single list of coordinate values, but got", cube_centers.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::cubes_intersect_grid(const JaggedTensor &cube_centers, const Vec3dOrScalar &cube_min, + const Vec3dOrScalar &cube_max, bool ignore_disabled) const { + TORCH_CHECK_VALUE( + cube_centers.ldim() == 1, + "Expected cube_centers to have 1 list dimension, i.e. be a single list of coordinate values, but got", + cube_centers.ldim(), "list dimensions"); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return fvdb::detail::ops::dispatchCubesIntersectGrid(*impl(), cube_centers, cube_min, cube_max, ignore_disabled); + return fvdb::detail::ops::dispatchCubesIntersectGrid( + *impl(), cube_centers, cube_min, cube_max, ignore_disabled); }); } - -JaggedTensor GridBatch::cubes_in_grid(const JaggedTensor& cube_centers, - const Vec3dOrScalar& cube_min, - const Vec3dOrScalar& cube_max, - bool ignore_disabled) const { - TORCH_CHECK_VALUE(cube_centers.ldim() == 1, - "Expected cube_centers to have 1 list dimension, i.e. be a single list of coordinate values, but got", cube_centers.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::cubes_in_grid(const JaggedTensor &cube_centers, const Vec3dOrScalar &cube_min, + const Vec3dOrScalar &cube_max, bool ignore_disabled) const { + TORCH_CHECK_VALUE( + cube_centers.ldim() == 1, + "Expected cube_centers to have 1 list dimension, i.e. be a single list of coordinate values, but got", + cube_centers.ldim(), "list dimensions"); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return fvdb::detail::ops::dispatchCubesInGrid(*impl(), cube_centers, cube_min, cube_max, ignore_disabled); + return fvdb::detail::ops::dispatchCubesInGrid(*impl(), cube_centers, cube_min, + cube_max, ignore_disabled); }); } - -JaggedTensor GridBatch::enabled_mask() const { +JaggedTensor +GridBatch::enabled_mask() const { return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { return fvdb::detail::ops::dispatchEnabledMask(*impl(), false); }); } -JaggedTensor GridBatch::disabled_mask() const { +JaggedTensor +GridBatch::disabled_mask() const { return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { return fvdb::detail::ops::dispatchEnabledMask(*impl(), true); }); } - -JaggedTensor GridBatch::coords_in_active_voxel(const JaggedTensor& ijk, bool ignore_disabled) const { - TORCH_CHECK_VALUE(ijk.ldim() == 1, - "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", ijk.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::coords_in_active_voxel(const JaggedTensor &ijk, bool ignore_disabled) const { + TORCH_CHECK_VALUE( + ijk.ldim() == 1, + "Expected ijk to have 1 list dimension, i.e. 
be a single list of coordinate values, but got", + ijk.ldim(), "list dimensions"); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { return fvdb::detail::ops::dispatchCoordsInGrid(*impl(), ijk, ignore_disabled); }); } - -JaggedTensor GridBatch::ijk_to_index(const JaggedTensor& ijk, bool cumulative) const { - TORCH_CHECK_VALUE(ijk.ldim() == 1, - "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", ijk.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::ijk_to_index(const JaggedTensor &ijk, bool cumulative) const { + TORCH_CHECK_VALUE( + ijk.ldim() == 1, + "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ijk.ldim(), "list dimensions"); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { return fvdb::detail::ops::dispatchIjkToIndex(*impl(), ijk, cumulative); }); } - -JaggedTensor GridBatch::ijk_to_inv_index(const JaggedTensor& ijk, bool cumulative) const { - TORCH_CHECK_VALUE(ijk.ldim() == 1, - "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", ijk.ldim(), "list dimensions" - ); +JaggedTensor +GridBatch::ijk_to_inv_index(const JaggedTensor &ijk, bool cumulative) const { + TORCH_CHECK_VALUE( + ijk.ldim() == 1, + "Expected ijk to have 1 list dimension, i.e. be a single list of coordinate values, but got", + ijk.ldim(), "list dimensions"); return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { return fvdb::detail::ops::dispatchIjkToInvIndex(*impl(), ijk, cumulative); }); } - -JaggedTensor GridBatch::ijk() const { +JaggedTensor +GridBatch::ijk() const { return FVDB_DISPATCH_KERNEL_DEVICE(this->device(), [&]() { return fvdb::detail::ops::dispatchActiveGridCoords(*impl(), true); }); } -JaggedTensor GridBatch::ijk_enabled() const { +JaggedTensor +GridBatch::ijk_enabled() const { return FVDB_DISPATCH_KERNEL_DEVICE(this->device(), [&]() { return fvdb::detail::ops::dispatchActiveGridCoords(*impl(), false); }); } - -const torch::Tensor GridBatch::bbox() const { +const torch::Tensor +GridBatch::bbox() const { const int64_t bs = grid_count(); - torch::Tensor ret = torch::zeros({bs, 2, 3}, torch::TensorOptions().device(device()).dtype(torch::kInt32)); + torch::Tensor ret = + torch::zeros({ bs, 2, 3 }, torch::TensorOptions().device(device()).dtype(torch::kInt32)); for (int64_t i = 0; i < bs; ++i) { - const nanovdb::CoordBBox& bbox = impl()->bbox(i); - ret[i][0][0] = bbox.min()[0]; - ret[i][0][1] = bbox.min()[1]; - ret[i][0][2] = bbox.min()[2]; - ret[i][1][0] = bbox.max()[0]; - ret[i][1][1] = bbox.max()[1]; - ret[i][1][2] = bbox.max()[2]; + const nanovdb::CoordBBox &bbox = impl()->bbox(i); + ret[i][0][0] = bbox.min()[0]; + ret[i][0][1] = bbox.min()[1]; + ret[i][0][2] = bbox.min()[2]; + ret[i][1][0] = bbox.max()[0]; + ret[i][1][1] = bbox.max()[1]; + ret[i][1][2] = bbox.max()[2]; } return ret; } -const torch::Tensor GridBatch::bbox_at(int64_t bi) const { - torch::Tensor ret = torch::zeros({2, 3}, torch::TensorOptions().device(device()).dtype(torch::kInt32)); - const nanovdb::CoordBBox& bbox = impl()->bbox(bi); - ret[0][0] = bbox.min()[0]; - ret[0][1] = bbox.min()[1]; - ret[0][2] = bbox.min()[2]; - ret[1][0] = bbox.max()[0]; - ret[1][1] = bbox.max()[1]; - ret[1][2] = bbox.max()[2]; +const torch::Tensor +GridBatch::bbox_at(int64_t bi) const { + torch::Tensor ret = + torch::zeros({ 2, 3 }, torch::TensorOptions().device(device()).dtype(torch::kInt32)); + const nanovdb::CoordBBox &bbox = impl()->bbox(bi); + ret[0][0] = bbox.min()[0]; + ret[0][1] = bbox.min()[1]; + ret[0][2] = 
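
ijk, ijk_to_index and coords_in_active_voxel compose naturally: the coordinates a grid reports map back to valid voxel indices. A sketch follows; the meaning of the cumulative flag (per-grid versus batch-wide offsets) is inferred from the surrounding code and should be double-checked against Ops.h, and the helper name is mine.

#include "GridBatch.h"
#include "JaggedTensor.h"

// Sketch: round-trip a grid's own coordinates through its index map.
void indexRoundTrip(const fvdb::GridBatch &grid) {
    fvdb::JaggedTensor coords = grid.ijk();                        // int coords, one list per grid
    fvdb::JaggedTensor index  = grid.ijk_to_index(coords, /*cumulative=*/false);
    fvdb::JaggedTensor inside = grid.coords_in_active_voxel(coords, /*ignore_disabled=*/false);
    // inside.jdata() should be all true, since coords came from the grid itself.
}
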
bbox.min()[2]; + ret[1][0] = bbox.max()[0]; + ret[1][1] = bbox.max()[1]; + ret[1][2] = bbox.max()[2]; return ret; } -const torch::Tensor GridBatch::dual_bbox() const { +const torch::Tensor +GridBatch::dual_bbox() const { const int64_t bs = grid_count(); - torch::Tensor ret = torch::zeros({bs, 2, 3}, torch::TensorOptions().device(device()).dtype(torch::kInt32)); + torch::Tensor ret = + torch::zeros({ bs, 2, 3 }, torch::TensorOptions().device(device()).dtype(torch::kInt32)); for (int64_t i = 0; i < bs; ++i) { - const nanovdb::CoordBBox& bbox = impl()->dualBbox(i); - ret[i][0][0] = bbox.min()[0]; - ret[i][0][1] = bbox.min()[1]; - ret[i][0][2] = bbox.min()[2]; - ret[i][1][0] = bbox.max()[0]; - ret[i][1][1] = bbox.max()[1]; - ret[i][1][2] = bbox.max()[2]; + const nanovdb::CoordBBox &bbox = impl()->dualBbox(i); + ret[i][0][0] = bbox.min()[0]; + ret[i][0][1] = bbox.min()[1]; + ret[i][0][2] = bbox.min()[2]; + ret[i][1][0] = bbox.max()[0]; + ret[i][1][1] = bbox.max()[1]; + ret[i][1][2] = bbox.max()[2]; } return ret; } -const torch::Tensor GridBatch::dual_bbox_at(int64_t bi) const { - torch::Tensor ret = torch::zeros({2, 3}, torch::TensorOptions().device(device()).dtype(torch::kInt32)); - const nanovdb::CoordBBox& bbox = impl()->dualBbox(bi); - ret[0][0] = bbox.min()[0]; - ret[0][1] = bbox.min()[1]; - ret[0][2] = bbox.min()[2]; - ret[1][0] = bbox.max()[0]; - ret[1][1] = bbox.max()[1]; - ret[1][2] = bbox.max()[2]; +const torch::Tensor +GridBatch::dual_bbox_at(int64_t bi) const { + torch::Tensor ret = + torch::zeros({ 2, 3 }, torch::TensorOptions().device(device()).dtype(torch::kInt32)); + const nanovdb::CoordBBox &bbox = impl()->dualBbox(bi); + ret[0][0] = bbox.min()[0]; + ret[0][1] = bbox.min()[1]; + ret[0][2] = bbox.min()[2]; + ret[1][0] = bbox.max()[0]; + ret[1][1] = bbox.max()[1]; + ret[1][2] = bbox.max()[2]; return ret; } -const torch::Tensor GridBatch::total_bbox() const { - const nanovdb::CoordBBox& bbox = impl()->totalBBox(); - return torch::tensor({{bbox.min()[0], bbox.min()[1], bbox.min()[2]}, - {bbox.max()[0], bbox.max()[1], bbox.max()[2]}}, - torch::TensorOptions().device(device()).dtype(torch::kInt32)); +const torch::Tensor +GridBatch::total_bbox() const { + const nanovdb::CoordBBox &bbox = impl()->totalBBox(); + return torch::tensor({ { bbox.min()[0], bbox.min()[1], bbox.min()[2] }, + { bbox.max()[0], bbox.max()[1], bbox.max()[2] } }, + torch::TensorOptions().device(device()).dtype(torch::kInt32)); } - -std::vector GridBatch::viz_edge_network(bool returnVoxelCoordinates) const { +std::vector +GridBatch::viz_edge_network(bool returnVoxelCoordinates) const { return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return fvdb::detail::ops::dispatchGridEdgeNetwork(*impl(), returnVoxelCoordinates); + return fvdb::detail::ops::dispatchGridEdgeNetwork(*impl(), + returnVoxelCoordinates); }); } diff --git a/fvdb/src/GridBatch.h b/fvdb/src/GridBatch.h index 5743cabd4e..796f165192 100644 --- a/fvdb/src/GridBatch.h +++ b/fvdb/src/GridBatch.h @@ -1,25 +1,24 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once -#include -#include - -#include -#include +#ifndef FVDB_GRIDBATCH_H +#define FVDB_GRIDBATCH_H -#include "detail/utils/Utils.h" #include "detail/GridBatchImpl.h" +#include "detail/utils/Utils.h" #include "JaggedTensor.h" #include "Types.h" -namespace fvdb { +#include +#include +#include +#include +namespace fvdb { struct GridBatch : torch::CustomClassHolder { - // Set some speed limits so you don't shoot yourself in the foot constexpr static 
int64_t MAX_GRIDS_PER_BATCH = 1024; @@ -30,63 +29,74 @@ struct GridBatch : torch::CustomClassHolder { /// @brief Return true if this is a contiguous view of the grid batch /// @return true if this is a contiguous view of the grid batch - bool is_contiguous() const { + bool + is_contiguous() const { return impl()->isContiguous(); } /// @brief Return a contiguous copy of this grid batch. If the grid batch is already contiguous, /// then return a reference to this /// @return A contiguous copy of this grid batch - GridBatch contiguous() const { + GridBatch + contiguous() const { return GridBatch(detail::GridBatchImpl::contiguous(impl())); } - /// @brief Get the voxel size of the bi^th grid in the batch and return is a tensor of type dtype + /// @brief Get the voxel size of the bi^th grid in the batch and return is a tensor of type + /// dtype /// @param bi The batch index of the grid for which to get the voxel size /// @param dtype The dtype of the returned tensor /// @return A tensor of shape [3,] containing the voxel size of the bi^th grid in the batch - torch::Tensor voxel_size_at(int64_t bi, const torch::Dtype& dtype = torch::kFloat32) const; + torch::Tensor voxel_size_at(int64_t bi, const torch::Dtype &dtype = torch::kFloat32) const; - /// @brief Get the voxel origin of the bi^th grid in the batch and return is a tensor of type dtype + /// @brief Get the voxel origin of the bi^th grid in the batch and return is a tensor of type + /// dtype /// @param bi The batch index of the grid for which to get the voxel origin /// @param dtype The dtype of the returned tensor /// @return A tensor of shape [3,] containing the voxel origin of the bi^th grid in the batch - torch::Tensor origin_at(int64_t bi, const torch::Dtype& dtype = torch::kFloat32) const; + torch::Tensor origin_at(int64_t bi, const torch::Dtype &dtype = torch::kFloat32) const; /// @brief Get the voxel size of all grids in this batch and return is a tensor of type dtype /// @param dtype The dtype of the returned tensor - /// @return A tensor of shape [grid_count(), 3] containing the voxel size of all grids indexed by this batch - torch::Tensor voxel_sizes(const torch::Dtype& dtype = torch::kFloat32) const; + /// @return A tensor of shape [grid_count(), 3] containing the voxel size of all grids indexed + /// by this batch + torch::Tensor voxel_sizes(const torch::Dtype &dtype = torch::kFloat32) const; /// @brief Get the voxel origins of all grids in this batch and return is a tensor of type dtype /// @param dtype The dtype of the returned tensor - /// @return A tensor of shape [grid_count(), 3] containing the voxel origins of all grids indexed by this batch - torch::Tensor origins(const torch::Dtype& dtype = torch::kFloat32) const; + /// @return A tensor of shape [grid_count(), 3] containing the voxel origins of all grids + /// indexed by this batch + torch::Tensor origins(const torch::Dtype &dtype = torch::kFloat32) const; /// @brief Get the number of grids indexed by this batch /// @return The number of grids indexed by this batch - int64_t grid_count() const { - TORCH_CHECK(impl()->batchSize() <= MAX_GRIDS_PER_BATCH, "Cannot have more than ", MAX_GRIDS_PER_BATCH, " grids in a batch"); + int64_t + grid_count() const { + TORCH_CHECK(impl()->batchSize() <= MAX_GRIDS_PER_BATCH, "Cannot have more than ", + MAX_GRIDS_PER_BATCH, " grids in a batch"); return impl()->batchSize(); } /// @brief The total number of enabled voxels indexed by this batch of grids /// For immutable grids, this returns the same value as total_voxels() /// @return 
The total number of enabled voxels indexed by this batch of grids - int64_t total_enabled_voxels() const { + int64_t + total_enabled_voxels() const { return impl()->totalEnabledVoxels(false); } /// @brief Get the total number of voxels indexed by this batch of grids /// @return The total number of voxels indexed by this batch of grids - int64_t total_voxels() const { + int64_t + total_voxels() const { return impl()->totalVoxels(); } /// @brief Get the number of voxels indexed by the bi^th grid in the batch /// @param bi The batch index of the grid for which to get the number of voxels /// @return The number of voxels indexed by the bi^th grid in the batch - int64_t num_voxels_at(int64_t bi) const { + int64_t + num_voxels_at(int64_t bi) const { return impl()->numVoxels(bi); } @@ -99,7 +109,8 @@ struct GridBatch : torch::CustomClassHolder { /// @brief Get the cumulative number of voxels indexed by the first bi+1 grids /// @param bi The batch index /// @return The cumulative number of voxels indexed by the first bi+1 grids - int64_t cum_voxels_at(int64_t bi) const { + int64_t + cum_voxels_at(int64_t bi) const { return impl()->cumVoxels(bi); } @@ -115,22 +126,26 @@ struct GridBatch : torch::CustomClassHolder { /// @brief Get the number of enabled voxels indexed by this batch of grids /// For immutable grids, this returns the same value as num_voxels() - /// @return An integer tensor containing the number of enabled voxels per grid indexed by this batch + /// @return An integer tensor containing the number of enabled voxels per grid indexed by this + /// batch torch::Tensor num_enabled_voxels() const; /// @brief Get the cumulative number of voxels indexed by the grids in this batch /// i.e. [nvox_0, nvox_0+nvox_1, nvox_0+nvox_1+nvox_2, ...] - /// @return An integer tensor containing the cumulative number of voxels indexed by the grids in this batch + /// @return An integer tensor containing the cumulative number of voxels indexed by the grids in + /// this batch torch::Tensor cum_voxels() const; /// @brief Get the cumulative number of voxels indexed by the grids in this batch /// i.e. [nvox_0, nvox_0+nvox_1, nvox_0+nvox_1+nvox_2, ...] 
- /// @return An integer tensor containing the cumulative number of voxels indexed by the grids in this batch + /// @return An integer tensor containing the cumulative number of voxels indexed by the grids in + /// this batch torch::Tensor cum_enabled_voxels() const; /// @brief Get the total number of bytes required to store all grids indexed by this batch /// @return The total number of bytes required to store all grids indexed by this batch - int64_t total_bytes() const { + int64_t + total_bytes() const { return impl()->totalBytes(); } @@ -140,7 +155,8 @@ struct GridBatch : torch::CustomClassHolder { /// @brief Get the total number of leaf nodes indexed by this batch of grids /// @return The total number of leaf nodes indexed by this batch of grids - int64_t total_leaf_nodes() const { + int64_t + total_leaf_nodes() const { return impl()->totalLeaves(); } @@ -148,64 +164,78 @@ struct GridBatch : torch::CustomClassHolder { /// @return An integer tensor containing the number of leaf nodes in each grid torch::Tensor num_leaf_nodes() const; - /// @brief Get the offsets of the voxels indexed by this batch of grid - /// @return A tensor of shape [batch_size, 2] where the [bi, 0]^th entry is the offset of the first voxel - /// and the [bi, 1]^th entry is the offset one past the last voxel indexed by the bi^th grid in the batch - torch::Tensor joffsets() const { + /// @return A tensor of shape [batch_size, 2] where the [bi, 0]^th entry is the offset of the + /// first voxel + /// and the [bi, 1]^th entry is the offset one past the last voxel indexed by the bi^th + /// grid in the batch + torch::Tensor + joffsets() const { return impl()->voxelOffsets(true); } /// @brief Get the list indices for theis batch of grids - /// @return A tensor of shape [total_grids, ldim] where the [i]^th entry is the list index of the i^th grid - torch::Tensor jlidx() const { + /// @return A tensor of shape [total_grids, ldim] where the [i]^th entry is the list index of + /// the i^th grid + torch::Tensor + jlidx() const { const torch::Tensor ret = impl()->jlidx(true); if (ret.numel() == 0) { - return torch::arange({grid_count()}, torch::TensorOptions().device(device()).dtype(torch::kInt64)); + return torch::arange({ grid_count() }, + torch::TensorOptions().device(device()).dtype(torch::kInt64)); } else { return ret; } } /// @brief Get the batch index for each voxel indexed by this batch of grids - /// @return An integer tensor of shape [total_voxels,] where the [i]^th entry is the batch index of the i^th voxel - torch::Tensor jidx() const { + /// @return An integer tensor of shape [total_voxels,] where the [i]^th entry is the batch index + /// of the i^th voxel + torch::Tensor + jidx() const { const torch::Tensor ret = impl()->jidx(true); if (grid_count() == 1 && ret.numel() == 0) { - return torch::zeros({total_voxels()}, torch::TensorOptions().device(device()).dtype(torch::kInt16)); + return torch::zeros({ total_voxels() }, + torch::TensorOptions().device(device()).dtype(torch::kInt16)); } else { return ret; } - } /// @brief Set the voxel size of all grids indexed by this batch to the specified value /// @param voxel_size A 3D (shape [3,]) tensor specifying the voxel size to set for each grid - inline void set_global_voxel_size(const Vec3dOrScalar& voxel_size) { + inline void + set_global_voxel_size(const Vec3dOrScalar &voxel_size) { impl()->setGlobalVoxelSize(voxel_size.value()); } /// @brief Set the voxel origin of all grids indexed by this batch to the specified value /// @param origin A 3D (shape [3,]) tensor 
specifying the voxel origin to set for each grid - inline void set_global_origin(const Vec3d& origin) { + inline void + set_global_origin(const Vec3d &origin) { impl()->setGlobalVoxelOrigin(origin.value()); } /// @brief Return true if this grid is mutable /// @return Whether the grid is mutable - inline bool is_mutable() const { + inline bool + is_mutable() const { return impl()->isMutable(); } /// @brief Get the device on which this grid is stored /// @return The device on which this grid is stored - inline c10::Device device() const { + inline c10::Device + device() const { return impl()->device(); } - /// @brief Get the primal transforms of the grids in this batch (i.e. world to primal grid coordinates) - /// @return A std::vector containing the primal transforms of the grids in this batch - inline const std::vector primal_transforms() const { + /// @brief Get the primal transforms of the grids in this batch (i.e. world to primal grid + /// coordinates) + /// @return A std::vector containing the primal transforms of the grids in + /// this batch + inline const std::vector + primal_transforms() const { std::vector transforms; transforms.reserve(grid_count()); for (int64_t bi = 0; bi < grid_count(); ++bi) { @@ -214,9 +244,12 @@ struct GridBatch : torch::CustomClassHolder { return transforms; } - /// @brief Get the dual transforms of the grids in this batch (i.e. world to dual grid coordinates) - /// @return A std::vector containing the dual transforms of the grids in this batch - inline const std::vector dual_transforms() const { + /// @brief Get the dual transforms of the grids in this batch (i.e. world to dual grid + /// coordinates) + /// @return A std::vector containing the dual transforms of the + /// grids in this batch + inline const std::vector + dual_transforms() const { std::vector transforms; transforms.reserve(grid_count()); for (int64_t bi = 0; bi < grid_count(); ++bi) { @@ -225,267 +258,330 @@ struct GridBatch : torch::CustomClassHolder { return transforms; } - /// @brief Get the primal transform of the bi^th grid in the batch (i.e. world to primal grid coordinates) + /// @brief Get the primal transform of the bi^th grid in the batch (i.e. world to primal grid + /// coordinates) /// @param bi The index of the grid in the batch for which to get the primal transform /// @return The primal transform of the bi^th grid in the batch - inline const fvdb::detail::VoxelCoordTransform primal_transform_at(int64_t bi) const { + inline const fvdb::detail::VoxelCoordTransform + primal_transform_at(int64_t bi) const { return impl()->primalTransform(bi); } - /// @brief Get the dual transform of the bi^th grid in the batch (i.e. world to dual grid coordinates) + /// @brief Get the dual transform of the bi^th grid in the batch (i.e. 
world to dual grid + /// coordinates) /// @param bi The index of the grid in the batch for which to get the dual transform /// @return The dual transform of the bi^th grid in the batch - inline const fvdb::detail::VoxelCoordTransform dual_transform_at(int64_t bi) const { + inline const fvdb::detail::VoxelCoordTransform + dual_transform_at(int64_t bi) const { return impl()->dualTransform(bi); } /// @brief Get the bounding box (in voxel coordinates) for each grid in the batch /// @return A tensor bboxes of shape [B, 2, 3] where - /// bboxes[bi] = [[bmin_i, bmin_j, bmin_z=k], [bmax_i, bmax_j, bmax_k]] is the bi^th bounding box - /// such that bmin <= ijk < bmax for all voxels ijk in the bi^th grid + /// bboxes[bi] = [[bmin_i, bmin_j, bmin_z=k], [bmax_i, bmax_j, bmax_k]] is the bi^th + /// bounding box such that bmin <= ijk < bmax for all voxels ijk in the bi^th grid const torch::Tensor bbox() const; /// @brief Get the bounding box (in voxel coordinates) of the bi^th grid in the batch /// @return A tensor, bbox, of shape [2, 3] where - /// bbox = [[bmin_i, bmin_j, bmin_z=k], [bmax_i, bmax_j, bmax_k]] is the bi^th bounding box - /// such that bmin <= ijk < bmax for all voxels ijk in the bi^th grid + /// bbox = [[bmin_i, bmin_j, bmin_z=k], [bmax_i, bmax_j, bmax_k]] is the bi^th bounding + /// box such that bmin <= ijk < bmax for all voxels ijk in the bi^th grid const torch::Tensor bbox_at(int64_t bi) const; /// @brief Get the bounding box (in voxel coordinates) for the dual of each grid in the batch /// @return A tensor bboxes of shape [B, 2, 3] where - /// bboxes[bi] = [[bmin_i, bmin_j, bmin_z=k], [bmax_i, bmax_j, bmax_k]] is the bi^th bounding box - /// such that bmin <= ijk < bmax for all voxels ijk in the dual of the bi^th grid + /// bboxes[bi] = [[bmin_i, bmin_j, bmin_z=k], [bmax_i, bmax_j, bmax_k]] is the bi^th + /// bounding box such that bmin <= ijk < bmax for all voxels ijk in the dual of the + /// bi^th grid const torch::Tensor dual_bbox() const; - /// @brief Get the bounding box (in voxel coordinates) of the dual of the bi^th grid in the batch + /// @brief Get the bounding box (in voxel coordinates) of the dual of the bi^th grid in the + /// batch /// @return A tensor, bbox, of shape [2, 3] where - /// bbox = [[bmin_i, bmin_j, bmin_z=k], [bmax_i, bmax_j, bmax_k]] is the bi^th bounding box - /// such that bmin <= ijk < bmax for all voxels ijk in the dual of the bi^th grid + /// bbox = [[bmin_i, bmin_j, bmin_z=k], [bmax_i, bmax_j, bmax_k]] is the bi^th bounding + /// box such that bmin <= ijk < bmax for all voxels ijk in the dual of the bi^th grid const torch::Tensor dual_bbox_at(int64_t bi) const; - /// @brief Get the bounding box (in voxel coordinates) which contains all the grids in this batch + /// @brief Get the bounding box (in voxel coordinates) which contains all the grids in this + /// batch /// @return A tensor, total_bbox, of shape [2, 3] where - /// total_bbox = [[bmin_i, bmin_j, bmin_z=k], [bmax_i, bmax_j, bmax_k]] is the bounding box - /// such that bmin <= ijk < bmax for all voxels ijk in the batch + /// total_bbox = [[bmin_i, bmin_j, bmin_z=k], [bmax_i, bmax_j, bmax_k]] is the bounding + /// box such that bmin <= ijk < bmax for all voxels ijk in the batch const torch::Tensor total_bbox() const; /// @brief Downsample this batch of grids using maxpooling - /// @param pool_factor How much to pool by (i,e, (2,2,2) means take max over 2x2x2 from start of window) - /// @param data Data at each voxel in this grid to be downsampled (JaggedTensor of shape [B, -1, *]) + /// @param 
pool_factor How much to pool by (i.e. (2,2,2) means take max over 2x2x2 from start of + /// window) + /// @param data Data at each voxel in this grid to be downsampled (JaggedTensor of shape [B, -1, + /// *]) /// @param stride The stride to use when pooling - /// @param coarse_grid An optional coarse grid used to specify the output. This is mainly used for memory - /// efficiency so you can chache grids. If you don't pass it in, we'll just create it for you. - /// @return A pair (coarseData, coarseGrid) where coarseData is a JaggedTensor of shape [B, -1, *] of downsampled data + /// @param coarse_grid An optional coarse grid used to specify the output. This is mainly used + /// for memory + /// efficiency so you can cache grids. If you don't pass it in, we'll just + /// create it for you. + /// @return A pair (coarseData, coarseGrid) where coarseData is a JaggedTensor of shape [B, -1, + /// *] of downsampled data /// and coarseGrid is a GridBatch representing the downsampled grid batch - std::pair max_pool(Vec3iOrScalar pool_factor, - const JaggedTensor& data, - Vec3iOrScalar stride = 0, - torch::optional coarse_grid = torch::nullopt) const; + std::pair + max_pool(Vec3iOrScalar pool_factor, const JaggedTensor &data, Vec3iOrScalar stride = 0, + torch::optional coarse_grid = torch::nullopt) const; /// @brief Downsample this batch of grids using average pooling - /// @param pool_factor How much to pool by (i,e, (2, 2, 2) means take max over 2x2x2 from start of window) - /// @param data Data at each voxel in this grid to be downsampled (JaggedTensor of shape [B, -1, *]) + /// @param pool_factor How much to pool by (i.e. (2, 2, 2) means take the average over 2x2x2 from start + /// of window) + /// @param data Data at each voxel in this grid to be downsampled (JaggedTensor of shape [B, -1, + /// *]) /// @param stride The stride to use when pooling - /// @param coarse_grid An optional coarse grid used to specify the output. This is mainly used for memory - /// efficiency so you can chache grids. If you don't pass it in, we'll just create it for you. - /// @return A pair (coarseData, coarseGrid) where coarseData is a JaggedTensor of shape [B, -1, *] of downsampled data + /// @param coarse_grid An optional coarse grid used to specify the output. This is mainly used + /// for memory + /// efficiency so you can cache grids. If you don't pass it in, we'll just + /// create it for you. + /// @return A pair (coarseData, coarseGrid) where coarseData is a JaggedTensor of shape [B, -1, + /// *] of downsampled data /// and coarseGrid is a GridBatch representing the downsampled grid batch - std::pair avg_pool(Vec3iOrScalar pool_factor, - const JaggedTensor& data, - Vec3iOrScalar stride = 0, - torch::optional coarse_grid = torch::nullopt) const; + std::pair + avg_pool(Vec3iOrScalar pool_factor, const JaggedTensor &data, Vec3iOrScalar stride = 0, + torch::optional coarse_grid = torch::nullopt) const; /// @brief Subdivide this batch of grids using nearest neighbor interpolation /// @param subdiv_factor How much to upsample by (i,e, (2,2,2) means upsample by 2x2x2) - /// @param data Data at each voxel in this grid to be upsampled (JaggedTensor of shape [B, -1, *]) + /// @param data Data at each voxel in this grid to be upsampled (JaggedTensor of shape [B, -1, + /// *]) /// @param mask An optional mask of shape [B, -1] specifying which coarse voxels to upsample - /// @param fine_grid An optional coarse grid used to specify the output. This is mainly used for memory - /// efficiency so you can chache grids.
If you don't pass it in, we'll just create it for you. - /// @return A pair (fineData, fineGrid) where fineData is a JaggedTensor of shape [B, -1, *] of upsampled data and + /// @param fine_grid An optional fine grid used to specify the output. This is mainly used for + /// memory + /// efficiency so you can cache grids. If you don't pass it in, we'll just + /// create it for you. + /// @return A pair (fineData, fineGrid) where fineData is a JaggedTensor of shape [B, -1, *] of + /// upsampled data and /// fineGrid is a GridBatch representing the upsampled grid batch - std::pair subdivide(Vec3iOrScalar subdiv_factor, - const JaggedTensor& data, - const torch::optional mask = torch::nullopt, - torch::optional fine_grid = torch::nullopt) const; + std::pair + subdivide(Vec3iOrScalar subdiv_factor, const JaggedTensor &data, + const torch::optional mask = torch::nullopt, + torch::optional fine_grid = torch::nullopt) const; /// @brief Read the values from a dense tensor of the voxels at the specified coordinates /// @param dense_data A dense tensor of shape [B, W, H, D, *] - /// @param dense_origins A tensor of shape [B, 3] or [3,] specifying the voxel coordinate(s) of the origin of the dense tensor i.e. [:, 0, 0, 0] - /// @return A JaggedTensor with shape [B, -1, *] containing the values at the specified coordinates - JaggedTensor read_from_dense(const torch::Tensor& dense_data, - const Vec3iBatch& dense_origins = torch::zeros(3, torch::kInt32)) const; + /// @param dense_origins A tensor of shape [B, 3] or [3,] specifying the voxel coordinate(s) of + /// the origin of the dense tensor i.e. [:, 0, 0, 0] + /// @return A JaggedTensor with shape [B, -1, *] containing the values at the specified + /// coordinates + JaggedTensor + read_from_dense(const torch::Tensor &dense_data, + const Vec3iBatch &dense_origins = torch::zeros(3, torch::kInt32)) const; /// @brief Read the values from a JaggedTensor indexed by this batch into a dense tensor - /// @param sparse_data A JaggedTensor of shape [B, -1, *] containing one value per voxel in the batch - /// @param min_coord An optional minimum coordinate to read from the batch (in voxel coordinates). + /// @param sparse_data A JaggedTensor of shape [B, -1, *] containing one value per voxel in the + /// batch + /// @param min_coord An optional minimum coordinate to read from the batch (in voxel + /// coordinates). /// Defaults to the minimum coordinate of the batch. /// @param grid_size An optional grid size to read from the batch (in voxel coordinates). /// Defaults to the total size of a grid containing the whole batch. - /// @return A dense tensor of shape [B, W, H, D, *] containing the values at the specified coordinates (and zero elsewhere) - torch::Tensor read_into_dense(const JaggedTensor& sparse_data, - const torch::optional& min_coord = torch::nullopt, - const torch::optional& grid_size = torch::nullopt) const; + /// @return A dense tensor of shape [B, W, H, D, *] containing the values at the specified + /// coordinates (and zero elsewhere) + torch::Tensor read_into_dense(const JaggedTensor &sparse_data, + const torch::optional &min_coord = torch::nullopt, + const torch::optional &grid_size = torch::nullopt) const; /// @brief Given a GridBatch and features associated with it, /// return a JaggedTensor representing features for this batch of grid. /// Fill any voxels not in the GridBatch with the default value. - /// @param features A JaggedTensor of shape [B, -1, *] containing features associated with other_grid.
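The pooling and subdivision declarations above pair naturally, so a short usage sketch may help. It is a minimal sketch only: it assumes the Python bindings keep the argument order and the (data, grid) return pairs documented here, and that the constructor, keyword spellings, and property spellings shown are correct.

    import torch
    import fvdb

    # A small batch of two grids built from random points (keyword spellings are
    # assumed; see the earlier construction sketch).
    points = fvdb.JaggedTensor([torch.rand(1000, 3), torch.rand(500, 3)])
    grid = fvdb.GridBatch(device="cpu")
    grid.set_from_points(points, voxel_sizes=0.05)

    # Per-voxel features with 16 channels, laid out to match `grid` (jagged_like
    # is declared later in this header).
    feats = grid.jagged_like(torch.randn(grid.total_voxels, 16))

    # Downsample by 2x2x2 with average pooling; a coarse grid is created because
    # no coarse_grid argument is passed.
    coarse_feats, coarse_grid = grid.avg_pool(2, feats)

    # Push the coarse features back onto a finer grid with nearest-neighbor
    # subdivision (a resolution round trip, not an exact inverse of pooling).
    fine_feats, fine_grid = coarse_grid.subdivide(2, coarse_feats)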
+ /// @param features A JaggedTensor of shape [B, -1, *] containing features associated with + /// other_grid. /// @param other_grid A GridBatch representing the grid to fill from. /// @param default_value The value to fill in for voxels not in other_grid. - JaggedTensor fill_to_grid(const JaggedTensor& features, - const GridBatch& other_grid, + JaggedTensor fill_to_grid(const JaggedTensor &features, const GridBatch &other_grid, float default_value = 0.0f) const; /// @brief Convert grid coordinates to world coordinates - /// @param ijk A JaggedTensor of grid coordinates with shape [B, -1, 3] (one point set per grid in the batch) - /// @return A JaggedTensor of world coordinates with shape [B, -1, 3] (one point set per grid in the batch) - JaggedTensor grid_to_world(const JaggedTensor& ijk) const; + /// @param ijk A JaggedTensor of grid coordinates with shape [B, -1, 3] (one point set per grid + /// in the batch) + /// @return A JaggedTensor of world coordinates with shape [B, -1, 3] (one point set per grid in + /// the batch) + JaggedTensor grid_to_world(const JaggedTensor &ijk) const; /// @brief Convert world coordinates to grid coordinates - /// @param xyz A JaggedTensor of world coordinates with shape [B, -1, 3] (one point set per grid in the batch) - /// @return A JaggedTensor of grid coordinates with shape [B, -1, 3] (one point set per grid in the batch) - JaggedTensor world_to_grid(const JaggedTensor& xyz) const; + /// @param xyz A JaggedTensor of world coordinates with shape [B, -1, 3] (one point set per grid + /// in the batch) + /// @return A JaggedTensor of grid coordinates with shape [B, -1, 3] (one point set per grid in + /// the batch) + JaggedTensor world_to_grid(const JaggedTensor &xyz) const; /// @brief Get grid-to-world matrices /// @return A JaggedTensor of grid-to-world matrices with shape [B, 4, 4] - torch::Tensor grid_to_world_matrices(const torch::Dtype& dtype = torch::kFloat32) const; + torch::Tensor grid_to_world_matrices(const torch::Dtype &dtype = torch::kFloat32) const; /// @brief Get world-to-grid matrices /// @return A JaggedTensor of world-to-grid matrices with shape [B, 4, 4] - torch::Tensor world_to_grid_matrices(const torch::Dtype& dtype = torch::kFloat32) const; + torch::Tensor world_to_grid_matrices(const torch::Dtype &dtype = torch::kFloat32) const; /// @brief Sample features on the grid batch using trilinear interpolation - /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the batch) - /// @param voxel_data a JaggedTensor of C-dimensional features at each voxel with shape [B, -1, C] or a Tensor of + /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the + /// batch) + /// @param voxel_data a JaggedTensor of C-dimensional features at each voxel with shape [B, -1, + /// C] or a Tensor of /// shape [N, C] where N is the total number of voxels in the batch /// (one item for each voxel in each grid in the batch) /// @return a JaggedTensor of sampled data with shape [B, -1, C] (one sample set per point) - JaggedTensor sample_trilinear(const JaggedTensor& points, - const JaggedTensor& voxel_data) const; + JaggedTensor sample_trilinear(const JaggedTensor &points, const JaggedTensor &voxel_data) const; /// @brief Sample features and spatial gradients on the grid batch using trilinear interpolation - /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the batch) - /// @param voxel_data a JaggedTensor of C-dimensional features at each voxel 
with shape [B, -1, C] or a Tensor of + /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the + /// batch) + /// @param voxel_data a JaggedTensor of C-dimensional features at each voxel with shape [B, -1, + /// C] or a Tensor of /// shape [N, C] where N is the total number of voxels in the batch /// (one item for each voxel in each grid in the batch) - /// @return a pair (feat, grad_feat) which are JaggedTensors of sampled data with shape [B, -1, C], and [B, -1, C, 3] - /// respectively where feat are the sampled features and grad_feat are the spatial gradients of the sampled - /// features (one sample set per point) - std::vector sample_trilinear_with_grad(const JaggedTensor& points, - const JaggedTensor& voxel_data) const; + /// @return a pair (feat, grad_feat) which are JaggedTensors of sampled data with shape [B, -1, + /// C], and [B, -1, C, 3] + /// respectively where feat are the sampled features and grad_feat are the spatial + /// gradients of the sampled features (one sample set per point) + std::vector sample_trilinear_with_grad(const JaggedTensor &points, + const JaggedTensor &voxel_data) const; /// @brief Sample features on the grid batch using bezier interpolation - /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the batch) - /// @param voxel_data a JaggedTensor of C-dimensional features at each voxel with shape [B, -1, C] or a Tensor of + /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the + /// batch) + /// @param voxel_data a JaggedTensor of C-dimensional features at each voxel with shape [B, -1, + /// C] or a Tensor of /// shape [N, C] where N is the total number of voxels in the batch /// (one item for each voxel in each grid in the batch) /// @return a JaggedTensor of sampled data with shape [B, -1, C] (one sample set per point) - JaggedTensor sample_bezier(const JaggedTensor& points, - const JaggedTensor& voxel_data) const; + JaggedTensor sample_bezier(const JaggedTensor &points, const JaggedTensor &voxel_data) const; /// @brief Sample features and spatial gradients on the grid batch using bezier interpolation - /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the batch) - /// @param voxel_data a JaggedTensor of C-dimensional features at each voxel with shape [B, -1, C] or a Tensor of + /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the + /// batch) + /// @param voxel_data a JaggedTensor of C-dimensional features at each voxel with shape [B, -1, + /// C] or a Tensor of /// shape [N, C] where N is the total number of voxels in the batch /// (one item for each voxel in each grid in the batch) - /// @return a pair (feat, grad_feat) which are JaggedTensors of sampled data with shape [B, -1, C], and [B, -1, C, 3] - /// respectively where feat are the sampled features and grad_feat are the spatial gradients of the sampled - /// features (one sample set per point) - std::vector sample_bezier_with_grad(const JaggedTensor& points, - const JaggedTensor& voxel_data) const; + /// @return a pair (feat, grad_feat) which are JaggedTensors of sampled data with shape [B, -1, + /// C], and [B, -1, C, 3] + /// respectively where feat are the sampled features and grad_feat are the spatial + /// gradients of the sampled features (one sample set per point) + std::vector sample_bezier_with_grad(const JaggedTensor &points, + const JaggedTensor &voxel_data) const; /// @brief Splat 
features at points into a grid batch using trilinear interpolation - /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the batch) - /// @param points_data a JaggedTensor of C-dimensional features at each point with shape [B, -1, C] - /// @return a JaggedTensor of C-dimensional features at each voxel in the batch with shape [B, -1, C] - JaggedTensor splat_trilinear(const JaggedTensor& points, - const JaggedTensor& points_data) const; + /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the + /// batch) + /// @param points_data a JaggedTensor of C-dimensional features at each point with shape [B, -1, + /// C] + /// @return a JaggedTensor of C-dimensional features at each voxel in the batch with shape [B, + /// -1, C] + JaggedTensor splat_trilinear(const JaggedTensor &points, const JaggedTensor &points_data) const; /// @brief Splat features at points into a grid using bezier interpolation - /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the batch) - /// @param points_data a JaggedTensor of C-dimensional features at each point with shape [B, -1, C] - /// @return a JaggedTensor of C-dimensional features at each voxel in the batch with shape [B, -1, C] - JaggedTensor splat_bezier(const JaggedTensor& points, - const JaggedTensor& points_data) const; + /// @param points a JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the + /// batch) + /// @param points_data a JaggedTensor of C-dimensional features at each point with shape [B, -1, + /// C] + /// @return a JaggedTensor of C-dimensional features at each voxel in the batch with shape [B, + /// -1, C] + JaggedTensor splat_bezier(const JaggedTensor &points, const JaggedTensor &points_data) const; /// @brief Get the indices of neighbors in the N-ring of each voxel in the grid batch - /// (possibly bitshifting the coordinates which is useful when you use multiple grids to represent different - /// levels of a hierarchy and you want to query this grid with coordinates at a finer level) - /// @param ijk A JaggedTensor of voxel coordinates with shape [B, -1, 3] (one set of coordinates per grid in the batch) + /// (possibly bitshifting the coordinates which is useful when you use multiple grids to + /// represent different levels of a hierarchy and you want to query this grid with + /// coordinates at a finer level) + /// @param ijk A JaggedTensor of voxel coordinates with shape [B, -1, 3] (one set of coordinates + /// per grid in the batch) /// @param extent The size of a neighborhood to find indexes /// @param bitshift The number of bits to shift the coordinates by - /// @return A JaggedTensor of neighbor indexes with shape [B, -1, 2*extent+1, 2*extent+1, 2*extent+1] (-1 value indicates no neighbor at that index) - JaggedTensor neighbor_indexes(const JaggedTensor& ijk, - int32_t extent, + /// @return A JaggedTensor of neighbor indexes with shape [B, -1, 2*extent+1, 2*extent+1, + /// 2*extent+1] (-1 value indicates no neighbor at that index) + JaggedTensor neighbor_indexes(const JaggedTensor &ijk, int32_t extent, int32_t bitshift = 0) const; /// @brief Return whether each point lies inside the grid batch - /// @param xyz A JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the batch) - /// @param ignore_disabled Whether to ignore voxels that have been disabled (only applicable to mutable grids) + /// @param xyz A JaggedTensor of points with shape [B, -1, 3] (one point set per grid in 
the + /// batch) + /// @param ignore_disabled Whether to ignore voxels that have been disabled (only applicable to + /// mutable grids) /// @return A JaggedTensor of booleans with shape [B, -1] (one boolean per point) - /// where the [bi, i]^th entry is true if points[bi, i] lies inside the bi^th grid in the batch - JaggedTensor points_in_active_voxel(const JaggedTensor& xyz, - bool ignore_disabled = false) const; + /// where the [bi, i]^th entry is true if points[bi, i] lies inside the bi^th grid in + /// the batch + JaggedTensor points_in_active_voxel(const JaggedTensor &xyz, + bool ignore_disabled = false) const; - /// @brief Return whether the cube with corners at cube_min and cube_max centered at each point in world space + /// @brief Return whether the cube with corners at cube_min and cube_max centered at each point + /// in world space /// intersect the grid batch - /// @param cube_centers A JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the batch) + /// @param cube_centers A JaggedTensor of points with shape [B, -1, 3] (one point set per grid + /// in the batch) /// @param cube_min A 3D tensor specifying the min corner relative to each point to check /// @param cube_max A 3D tensor specifying the max corner relative to each point to check - /// @param ignore_disabled Whether to ignore voxels that have been disabled (only applicable to mutable grids) + /// @param ignore_disabled Whether to ignore voxels that have been disabled (only applicable to + /// mutable grids) /// @return A JaggedTensor of booleans with shape [B, -1] (one boolean per point) - /// where the [bi, i]^th entry is true if the cube with extent (min, max) + points[bi, i] intersects - /// the bi^th grid in the batch - JaggedTensor cubes_intersect_grid(const JaggedTensor& cube_centers, - const Vec3dOrScalar& cube_min = 0.0, - const Vec3dOrScalar& cube_max = 0.0, - bool ignore_disabled = false) const; - - /// @brief Return whether the cube with corners at cube_min and cube_max centered at each point in world space + /// where the [bi, i]^th entry is true if the cube with extent (min, max) + points[bi, + /// i] intersects the bi^th grid in the batch + JaggedTensor cubes_intersect_grid(const JaggedTensor &cube_centers, + const Vec3dOrScalar &cube_min = 0.0, + const Vec3dOrScalar &cube_max = 0.0, + bool ignore_disabled = false) const; + + /// @brief Return whether the cube with corners at cube_min and cube_max centered at each point + /// in world space /// is fully contained in the grid batch's stencil - /// @param cube_centers A JaggedTensor of points with shape [B, -1, 3] (one point set per grid in the batch) + /// @param cube_centers A JaggedTensor of points with shape [B, -1, 3] (one point set per grid + /// in the batch) /// @param cube_min A 3D tensor specifying the min corner relative to each point to check /// @param cube_max A 3D tensor specifying the max corner relative to each point to check - /// @param ignore_disabled Whether to ignore voxels that have been disabled (only applicable to mutable grids) + /// @param ignore_disabled Whether to ignore voxels that have been disabled (only applicable to + /// mutable grids) /// @return A JaggedTensor of booleans with shape [B, -1] (one boolean per point) - /// where the [bi, i]^th entry is true if the cube with extent (min, max) + points[bi, i] lies - /// inside the bi^th grid in the batch - JaggedTensor cubes_in_grid(const JaggedTensor& cube_centers, - const Vec3dOrScalar& cube_min = 0.0, - const Vec3dOrScalar& cube_max = 0.0, - bool 
ignore_disabled = false) const; + /// where the [bi, i]^th entry is true if the cube with extent (min, max) + points[bi, + /// i] lies inside the bi^th grid in the batch + JaggedTensor cubes_in_grid(const JaggedTensor &cube_centers, + const Vec3dOrScalar &cube_min = 0.0, + const Vec3dOrScalar &cube_max = 0.0, + bool ignore_disabled = false) const; /// @brief Return a boolean mask indicating whether each voxel in the grid is enabled or not - /// @return A boolean JaggedTensor of shape [B, -1] indicating whether each voxel in the grid is enabled or not + /// @return A boolean JaggedTensor of shape [B, -1] indicating whether each voxel in the grid is + /// enabled or not JaggedTensor enabled_mask() const; /// @brief Return a boolean mask indicating whether each voxel in the grid is disabled or not - /// @return A boolean JaggedTensor of shape [B, -1] indicating whether each voxel in the grid is disabled or not + /// @return A boolean JaggedTensor of shape [B, -1] indicating whether each voxel in the grid is + /// disabled or not JaggedTensor disabled_mask() const; /// @brief Return whether each coordinate is in the grid batch or not /// @param ijk A JaggedTensor of ijk coordinates with lshape [N_0, ..., N_B] and eshape (3,) /// (one coordinate set per grid in the batch) - /// @param ignore_disabled Whether to ignore voxels that have been disabled (only applicable to mutable grids) + /// @param ignore_disabled Whether to ignore voxels that have been disabled (only applicable to + /// mutable grids) /// @return A JaggedTensor of booleans with shape [B, -1] (one boolean per coordinate) - /// where the [bi, i]^th entry is true if coords[bi, i] lies inside the bi^th grid in the batch - JaggedTensor coords_in_active_voxel(const JaggedTensor& ijk, bool ignore_disabled = false) const; + /// where the [bi, i]^th entry is true if coords[bi, i] lies inside the bi^th grid in + /// the batch + JaggedTensor coords_in_active_voxel(const JaggedTensor &ijk, + bool ignore_disabled = false) const; /// @brief Return the integer offset of each ijk value in the grid batch - /// @param ijk A JaggedTensor of ijk coordinates with shape [B, -1, 3] (one coordinate set per grid in the batch) - /// @param cumulative Whether to return cumulative offsets in the batch or offsets relative to each grid - /// @return A JaggedTensor of integer offsets with shape [B, -1] into the grid batch (one offset per coordinate) - JaggedTensor ijk_to_index(const JaggedTensor& ijk, bool cumulative = false) const; - - /// @brief Return a JaggedTensor of integers such that if it is used as a permutation of the input IJK coordinates, - /// it will re-order them to the indexing order of the grid batch. This effectively performs the inverse of - /// ijk_to_index if you pass in the ijk coordinates in the grid. + /// @param ijk A JaggedTensor of ijk coordinates with shape [B, -1, 3] (one coordinate set per + /// grid in the batch) + /// @param cumulative Whether to return cumulative offsets in the batch or offsets relative to + /// each grid + /// @return A JaggedTensor of integer offsets with shape [B, -1] into the grid batch (one offset + /// per coordinate) + JaggedTensor ijk_to_index(const JaggedTensor &ijk, bool cumulative = false) const; + + /// @brief Return a JaggedTensor of integers such that if it is used as a permutation of the + /// input IJK coordinates, + /// it will re-order them to the indexing order of the grid batch. This effectively + /// performs the inverse of ijk_to_index if you pass in the ijk coordinates in the grid. 
/// i.e. output[ijk_to_index(ijk[i])] = i /// @param ijk A JaggedTensor of ijk coordinates with lshape [N_0, ..., N_B] and eshape (3,) /// (one coordinate set per grid in the batch) - /// @param cumulative Whether to return cumulative offsets in the batch or offsets relative to each grid - /// @return A JaggedTensor of integers with shape [B, -1] (one integer per grids' ijk) which inverts ijkToIndex - JaggedTensor ijk_to_inv_index(const JaggedTensor& ijk, bool cumulative = false) const; + /// @param cumulative Whether to return cumulative offsets in the batch or offsets relative to + /// each grid + /// @return A JaggedTensor of integers with shape [B, -1] (one integer per grids' ijk) which + /// inverts ijkToIndex + JaggedTensor ijk_to_inv_index(const JaggedTensor &ijk, bool cumulative = false) const; /// @brief Return the set of active ijk coordinates indexed by this grid batch /// @return A JaggedTensor of voxel coordinates indexed by this grid batch (shape [B, -1, 3]) @@ -496,35 +592,48 @@ struct GridBatch : torch::CustomClassHolder { /// @return A JaggedTensor of voxel coordinates indexed by this grid batch (shape [B, -1, 3]) JaggedTensor ijk_enabled() const; - /// @brief Find the intersection between a collection of rays and the zero level set of a scalar field + /// @brief Find the intersection between a collection of rays and the zero level set of a scalar + /// field /// at each voxel in the grid batch - /// @param ray_origins A JaggedTensor of ray origins with shape [B, -1, 3] (one ray set per grid in the batch) - /// @param ray_directions A JaggedTensor of ray directions with shape [B, -1, 3] (one ray set per grid in the batch) - /// @param grid_scalars A JaggedTensor of scalar values with shape [B, -1] (one scalar per voxel in the batch) + /// @param ray_origins A JaggedTensor of ray origins with shape [B, -1, 3] (one ray set per grid + /// in the batch) + /// @param ray_directions A JaggedTensor of ray directions with shape [B, -1, 3] (one ray set + /// per grid in the batch) + /// @param grid_scalars A JaggedTensor of scalar values with shape [B, -1] (one scalar per voxel + /// in the batch) /// @param eps Skip voxels where the ray intersects by less than this distance /// @return A JaggedTensor of intersection times with shape [B, -1] (one time per ray) - JaggedTensor ray_implicit_intersection(const JaggedTensor& ray_origins, - const JaggedTensor& ray_directions, - const JaggedTensor& grid_scalars, - double eps = 0.0) const; + JaggedTensor ray_implicit_intersection(const JaggedTensor &ray_origins, + const JaggedTensor &ray_directions, + const JaggedTensor &grid_scalars, + double eps = 0.0) const; - /// @brief Enumerate the voxels in this grid batch (in-sorted order) intersected by a collection of rays + /// @brief Enumerate the voxels in this grid batch (in-sorted order) intersected by a collection + /// of rays /// @param ray_origins A JaggedTensor of ray origins with lshape [N_0, ..., N_B] and eshape [3,] /// where N_i is the number of rays to intersect with the i^th grid - /// @param ray_directions A JaggedTensor of ray directions with lshape [N_0, ..., N_B] and eshape [3,] + /// @param ray_directions A JaggedTensor of ray directions with lshape [N_0, ..., N_B] and + /// eshape [3,] /// where N_i is the number of rays to intersect with the i^th grid /// @param max_voxels The maximum number of voxels to return per ray /// @param eps Skip voxels where the ray intersects by less than this distance - /// @param return_ijk Whether to return the voxel coordinates in the grid or 
world coordinates or the voxel index - /// @param cumulative Whether to return cumulative indices in the batch or indices relative to each grid + /// @param return_ijk Whether to return the voxel coordinates in the grid or world coordinates + /// or the voxel index + /// @param cumulative Whether to return cumulative indices in the batch or indices relative to + /// each grid /// (only applicable to return_ijk = false, otherwise ignored) - /// @return A pair of JaggedTensors containing the voxels (or voxel indices) intersected by the rays. i.e.: - /// - voxels: A JaggedTensor with lshape [[V_{0,0}, ..., V_{0,N_0}], ..., [V_{B,0}, ..., V_{B,N_B}]] - /// and eshape (3,) or (,) containing the ijk coordinates or indices of the voxels - /// - times: A JaggedTensor with lshape [[T_{0,0}, ..., T_{0,N_0}], ..., [T_{B,0}, ..., T_{B,N_B}]] - /// and eshape (2,) containg the entry and exit distance along the ray of each voxel - std::vector voxels_along_rays(const JaggedTensor& ray_origins, - const JaggedTensor& ray_directions, + /// @return A pair of JaggedTensors containing the voxels (or voxel indices) intersected by the + /// rays. i.e.: + /// - voxels: A JaggedTensor with lshape [[V_{0,0}, ..., V_{0,N_0}], ..., [V_{B,0}, + /// ..., V_{B,N_B}]] + /// and eshape (3,) or (,) containing the ijk coordinates or indices of + /// the voxels + /// - times: A JaggedTensor with lshape [[T_{0,0}, ..., T_{0,N_0}], ..., [T_{B,0}, + /// ..., T_{B,N_B}]] + /// and eshape (2,) containg the entry and exit distance along the ray of + /// each voxel + std::vector voxels_along_rays(const JaggedTensor &ray_origins, + const JaggedTensor &ray_directions, int64_t max_voxels, double eps = 0.0, bool return_ijk = true, bool cumulative = false) const; @@ -533,90 +642,99 @@ struct GridBatch : torch::CustomClassHolder { /// grid batch (in-sorted order) intersected by a collection of rays /// @param ray_origins A JaggedTensor of ray origins with lshape [N_0, ..., N_B] and eshape [3,] /// where N_i is the number of rays to intersect with the i^th grid - /// @param ray_directions A JaggedTensor of ray directions with lshape [N_0, ..., N_B] and eshape [3,] + /// @param ray_directions A JaggedTensor of ray directions with lshape [N_0, ..., N_B] and + /// eshape [3,] /// where N_i is the number of rays to intersect with the i^th grid /// @param max_segments The maximum number of segments to return per ray /// @param eps Skip segments whose length is less than this distance /// @param ignore_masked If set to true, will treat masked voxels as active /// @return A JaggedTensor containing the segments intersected by the rays. i.e. 
a JaggedTensor /// with lshape [[S_{0,0}, ..., S_{0,N_0}], ..., [S_{B,0}, ..., S_{B,N_B}]] - JaggedTensor segments_along_rays(const JaggedTensor& ray_origins, - const JaggedTensor& ray_directions, - int64_t max_segments, double eps = 0.0, bool ignore_masked = false) const; + JaggedTensor segments_along_rays(const JaggedTensor &ray_origins, + const JaggedTensor &ray_directions, int64_t max_segments, + double eps = 0.0, bool ignore_masked = false) const; /// @brief Generate a set of uniform samples in active regions along a specified set of rays /// @param ray_origins A JaggedTensor of ray origins with lshape [N_0, ..., N_B] and eshape [3,] /// where N_i is the number of rays to intersect with the i^th grid - /// @param ray_directions A JaggedTensor of ray directions with lshape [N_0, ..., N_B] and eshape [3,] + /// @param ray_directions A JaggedTensor of ray directions with lshape [N_0, ..., N_B] and + /// eshape [3,] /// where N_i is the number of rays to intersect with the i^th grid /// @param t_min The start distance along each ray to begin generating samples /// @param t_max The end distance along each ray to stop generating samples /// @param step_size The distance between samples along each ray /// @param cone_angle A cone angle for each ray used to space samples along the ray /// @param include_end_segments Whether to include the end segments of the rays in the samples - /// @param return_midpoints Whether to return the midpoint of each sample instead of the start and end + /// @param return_midpoints Whether to return the midpoint of each sample instead of the start + /// and end /// @param eps Skip segments whose length is less than this distance /// @return A JaggedTensor containing the samples along the rays. i.e. a JaggedTensor - /// with lshape [[S_{0,0}, ..., S_{0,N_0}], ..., [S_{B,0}, ..., S_{B,N_B}]] and eshape (2,) or (1,) - /// representing the start and end distance of each sample or the midpoint of each sample - /// if return_midpoints is true - JaggedTensor uniform_ray_samples(const JaggedTensor& ray_origins, - const JaggedTensor& ray_directions, - const JaggedTensor& t_min, - const JaggedTensor& t_max, - double step_size, - double cone_angle = 0.0, - bool include_end_segments = true, - bool return_midpoints = false, - double eps = 0.0) const; + /// with lshape [[S_{0,0}, ..., S_{0,N_0}], ..., [S_{B,0}, ..., S_{B,N_B}]] and eshape + /// (2,) or (1,) representing the start and end distance of each sample or the midpoint + /// of each sample if return_midpoints is true + JaggedTensor uniform_ray_samples(const JaggedTensor &ray_origins, + const JaggedTensor &ray_directions, const JaggedTensor &t_min, + const JaggedTensor &t_max, double step_size, + double cone_angle = 0.0, bool include_end_segments = true, + bool return_midpoints = false, double eps = 0.0) const; /// @brief Return an edge network used which can be used to plot the grids in this batch - /// @param return_voxel_coordinates Whether to return the vertices in voxel coordinates or world coordinates - /// @return A pair (verts, edges) where verts is a JaggedTensor of vertex positions with shape [B, -1, 3] - /// (one vertex set per grid in the batch) and edges is a JaggedTensor of edge indices of - /// shape [B, -1, 2] (one edge set per grid in the batch) + /// @param return_voxel_coordinates Whether to return the vertices in voxel coordinates or world + /// coordinates + /// @return A pair (verts, edges) where verts is a JaggedTensor of vertex positions with shape + /// [B, -1, 3] + /// (one vertex set per grid 
in the batch) and edges is a JaggedTensor of edge indices + /// of shape [B, -1, 2] (one edge set per grid in the batch) std::vector viz_edge_network(bool return_voxel_coordinates = false) const; - /// @brief Disable the specified voxels in the grid batch. If the input ijk values refer to non-indexed voxels, + /// @brief Disable the specified voxels in the grid batch. If the input ijk values refer to + /// non-indexed voxels, /// then these are simply ignored. - /// @param ijk A Jagged tensor of shape [B, -1, 3] of coordinates to disable(one set of coordinates per grid in the batch) + /// @param ijk A Jagged tensor of shape [B, -1, 3] of coordinates to disable (one set of + /// coordinates per grid in the batch) /// @note This is only applicable to mutable grids - void disable_ijk(const JaggedTensor& ijk); + void disable_ijk(const JaggedTensor &ijk); - /// @brief Enable the specified voxels in the grid batch. If the input ijk values refer to non-indexed voxels, + /// @brief Enable the specified voxels in the grid batch. If the input ijk values refer to + /// non-indexed voxels, /// then these are simply ignored. - /// @param ijk A Jagged tensor of shape [B, -1, 3] of coordinates to enable (one set of coordinates per grid in the batch) + /// @param ijk A Jagged tensor of shape [B, -1, 3] of coordinates to enable (one set of + /// coordinates per grid in the batch) /// @note This is only applicable to mutable grids - void enable_ijk(const JaggedTensor& ijk); - - /// @brief Return a batch of grids representing the dual of this batch. i.e. The centers of the dual grid correspond - /// to the corners of this grid batch. The [i, j, k] coordinate of the dual grid corresponds to the bottom/left/back - /// corner of the [i, j, k] voxel in this grid batch. - /// @param exclude_border Whether to exclude the border of the grid batch when computing the dual grid + void enable_ijk(const JaggedTensor &ijk); + + /// @brief Return a batch of grids representing the dual of this batch. i.e. The centers of the + /// dual grid correspond + /// to the corners of this grid batch. The [i, j, k] coordinate of the dual grid + /// corresponds to the bottom/left/back corner of the [i, j, k] voxel in this grid batch. + /// @param exclude_border Whether to exclude the border of the grid batch when computing the + /// dual grid /// @return A GridBatch representing the dual of this grid batch GridBatch dual_grid(bool exclude_border = false) const; /// @brief Return a batch of grids representing the coarsened version of this batch. - /// Each voxel [i, j, k] in this grid batch maps to voxel [i / branchFactor, j / branchFactor, k / branchFactor] - /// in the coarse batch. - /// @param coarsening_factor The factor by which to coarsen the grid batch (i.e (2, 2, 2) coarses by a factor of 2x2x2) + /// Each voxel [i, j, k] in this grid batch maps to voxel [i / branchFactor, j / + /// branchFactor, k / branchFactor] in the coarse batch. + /// @param coarsening_factor The factor by which to coarsen the grid batch (i.e. (2, 2, 2) + /// coarsens by a factor of 2x2x2) /// @return A GridBatch representing the coarsened version of this batch. GridBatch coarsened_grid(Vec3iOrScalar coarsening_factor) const; /// @brief Subdivide the grid batch into a finer grid batch. - /// Each voxel [i, j, k] in this grid batch maps to voxels [i * subdivFactor, j * subdivFactor, k * subdivFactor] - /// in the fine batch.
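The ray queries documented a little earlier (voxels_along_rays, segments_along_rays, uniform_ray_samples) are easiest to see end to end. The sketch below is hedged: it assumes the Python bindings keep the documented names and defaults, that per-batch rays are passed as JaggedTensors with one ray set per grid, and that grid_count and total_voxels are exposed as properties.

    import torch
    import fvdb

    # A small batch of two grids to trace rays against.
    points = fvdb.JaggedTensor([torch.rand(1000, 3), torch.rand(500, 3)])
    grid = fvdb.GridBatch(device="cpu")
    grid.set_from_points(points, voxel_sizes=0.05)

    # One bundle of 64 rays per grid in the batch, all starting at the origin.
    num_grids = grid.grid_count
    origins = fvdb.JaggedTensor([torch.zeros(64, 3) for _ in range(num_grids)])
    directions = fvdb.JaggedTensor(
        [torch.nn.functional.normalize(torch.randn(64, 3), dim=-1) for _ in range(num_grids)])

    # Enumerate up to 32 voxels hit by each ray, returning ijk coordinates and the
    # entry/exit distance of the ray through each voxel, as documented above.
    voxels, times = grid.voxels_along_rays(origins, directions, max_voxels=32, eps=0.0)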
+ /// Each voxel [i, j, k] in this grid batch maps to voxels [i * subdivFactor, j * + /// subdivFactor, k * subdivFactor] in the fine batch. /// @param subdiv_factor The factor by which to subdivide the grid batch - /// @param mask An optional JaggedTensor of shape [B, -1] of boolean values indicating which voxels to subdivide + /// @param mask An optional JaggedTensor of shape [B, -1] of boolean values indicating which + /// voxels to subdivide /// @return A GridBatch representing the subdivided version of this batch. - GridBatch subdivided_grid(Vec3iOrScalar subdiv_factor, + GridBatch subdivided_grid(Vec3iOrScalar subdiv_factor, const torch::optional mask = torch::nullopt) const; /// @brief Return a batch of grids representing the clipped version of this batch of grids. /// @param ijk_min Index space minimum bound of the clip region. /// @param ijk_max Index space maximum bound of the clip region. /// @return A GridBatch representing the clipped version of this batch of grids. - GridBatch clipped_grid(const Vec3iBatch& ijk_min, const Vec3iBatch& ijk_max) const; + GridBatch clipped_grid(const Vec3iBatch &ijk_min, const Vec3iBatch &ijk_max) const; /// @brief Generate the grid that is affected by the convolution operator. /// @param kernel_size The kernel size of convolution @@ -624,31 +742,41 @@ struct GridBatch : torch::CustomClassHolder { /// @return A GridBatch representing the convolved grid. GridBatch conv_grid(Vec3iOrScalar kernel_size, Vec3iOrScalar stride) const; - /// @brief Return a batch of grids representing the clipped version of this batch of grids and corresponding features. - /// @param features A JaggedTensor of shape [B, -1, *] containing features associated with this batch of grids. + /// @brief Return a batch of grids representing the clipped version of this batch of grids and + /// corresponding features. + /// @param features A JaggedTensor of shape [B, -1, *] containing features associated with this + /// batch of grids. /// @param ijk_min Index space minimum bound of the clip region. /// @param ijk_max Index space maximum bound of the clip region. - /// @return A pair (clipped_features, clipped_grid) where clipped_features is a JaggedTensor of shape [B, -1, *] and + /// @return A pair (clipped_features, clipped_grid) where clipped_features is a JaggedTensor of + /// shape [B, -1, *] and /// clipped_grid is a GridBatch representing the clipped version of this batch of grids. - std::pair clip(const JaggedTensor& features, const Vec3iBatch& ijk_min, const Vec3iBatch& ijk_max) const; + std::pair clip(const JaggedTensor &features, const Vec3iBatch &ijk_min, + const Vec3iBatch &ijk_max) const; /// @brief Extract 0-isosurface from an implicit field. /// @param field implicit value stored on each voxel center (or voxel corner on a dual grid) /// @param level level set of the surface to extract /// @return vertices and faces arrays of the extracted isosurface - std::vector marching_cubes(const JaggedTensor& field, double level = 0.0) const; + std::vector marching_cubes(const JaggedTensor &field, double level = 0.0) const; - /// @brief Perform in-grid convolution using fast halo buffer method. Currently only supports kernel_size = 3. - /// @param features A JaggedTensor of shape [B, -1, *] containing features associated with this batch of grids. + /// @brief Perform in-grid convolution using fast halo buffer method. Currently only supports + /// kernel_size = 3. 
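For the marching_cubes declaration above, a short hedged sketch: the field below is random noise standing in for a real signed-distance field, and the two-value unpacking assumes the Python binding returns the (vertices, faces) pair documented here; the construction and keyword spellings are likewise assumptions.

    import torch
    import fvdb

    # A single grid built from random points.
    points = fvdb.JaggedTensor([torch.rand(4096, 3)])
    grid = fvdb.GridBatch(device="cpu")
    grid.set_from_points(points, voxel_sizes=0.05)

    # One scalar field value per voxel; in practice this would come from data or a
    # network rather than torch.randn.
    field = grid.jagged_like(torch.randn(grid.total_voxels))

    # Extract the zero level set as one (vertices, faces) pair per grid.
    verts, faces = grid.marching_cubes(field, level=0.0)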
+ /// @param features A JaggedTensor of shape [B, -1, *] containing features associated with this + /// batch of grids. /// @param kernel A tensor of shape [Out, In, 3, 3, 3] containing the kernel to convolve with. /// @return A JaggedTensor of shape [B, -1, *] containing the convolved features. - JaggedTensor sparse_conv_halo(const JaggedTensor& features, const torch::Tensor& kernel, int variant) const; + JaggedTensor sparse_conv_halo(const JaggedTensor &features, const torch::Tensor &kernel, + int variant) const; - /// @brief Return a grid batch on the specified device. If the passed in device is the same as this grid batch's - /// device, then this grid batch is returned. Otherwise, a copy of this grid batch is returned on the specified device. + /// @brief Return a grid batch on the specified device. If the passed in device is the same as + /// this grid batch's + /// device, then this grid batch is returned. Otherwise, a copy of this grid batch is + /// returned on the specified device. /// @param to_device The device to return the grid batch on /// @return A GridBatch representing this grid batch on the specified device - GridBatch to(TorchDeviceOrString to_device) const { + GridBatch + to(TorchDeviceOrString to_device) const { torch::Device toDevice = to_device.value(); if (toDevice == device()) { return GridBatch(impl()); @@ -657,34 +785,45 @@ struct GridBatch : torch::CustomClassHolder { } } - /// @brief Return a grid batch on the same device as the specified grid batch. If the passed in grid has the same device as this grid batch's - /// device, then this grid batch is returned. Otherwise, a copy of this grid batch is returned on the specified device. + /// @brief Return a grid batch on the same device as the specified grid batch. If the passed in + /// grid has the same device as this grid batch's + /// device, then this grid batch is returned. Otherwise, a copy of this grid batch is + /// returned on the specified device. /// @param to_grid The grid batch used to specify which device to return the grid batch on /// @return A GridBatch representing this grid batch on the specified device - GridBatch to(const GridBatch& to_grid) const { + GridBatch + to(const GridBatch &to_grid) const { return this->to(to_grid.device()); } - /// @brief Return a grid batch on the same device as the specified tensor. If the passed in tensor has the same device as this grid batch's - /// device, then this grid batch is returned. Otherwise, a copy of this grid batch is returned on the specified device. + /// @brief Return a grid batch on the same device as the specified tensor. If the passed in + /// tensor has the same device as this grid batch's + /// device, then this grid batch is returned. Otherwise, a copy of this grid batch is + /// returned on the specified device. /// @param to_tensor The tensor used to specify which device to return the grid batch on /// @return A GridBatch representing this grid batch on the specified device - GridBatch to(const torch::Tensor& to_tensor) const { + GridBatch + to(const torch::Tensor &to_tensor) const { return this->to(to_tensor.device()); } - /// @brief Return a grid batch on the same device as the specified JaggedTensor. If the passed in JaggedTensor has the same device as this grid batch's - /// device, then this grid batch is returned. Otherwise, a copy of this grid batch is returned on the specified device. + /// @brief Return a grid batch on the same device as the specified JaggedTensor. 
If the passed + /// in JaggedTensor has the same device as this grid batch's + /// device, then this grid batch is returned. Otherwise, a copy of this grid batch is + /// returned on the specified device. /// @param to_jtensor The JaggedTensor used to specify which device to return the grid batch on /// @return A GridBatch representing this grid batch on the specified device - GridBatch to(const JaggedTensor& to_jtensor) const { + GridBatch + to(const JaggedTensor &to_jtensor) const { return this->to(to_jtensor.device()); } - /// @brief Return a view of this grid batch containing the grid at the specified index i.e. grid_batch[bi] + /// @brief Return a view of this grid batch containing the grid at the specified index i.e. + /// grid_batch[bi] /// @param bi The index to get a view on /// @return A GridBatch representing the grid at the specified index - GridBatch index(int64_t bi) const { + GridBatch + index(int64_t bi) const { return GridBatch(impl()->index(bi)); } @@ -693,146 +832,182 @@ struct GridBatch : torch::CustomClassHolder { /// @param stop The stop index of the slice /// @param step The step of the slice /// @return A GridBatch representing the slice of this grid batch - GridBatch index(size_t start, size_t stop, size_t step) const { + GridBatch + index(size_t start, size_t stop, size_t step) const { return GridBatch(impl()->index(start, stop, step)); } - /// @brief Return a view of this grid batch at the specified indices i.e. grid_batch[[i1, i2, ...]] + /// @brief Return a view of this grid batch at the specified indices i.e. grid_batch[[i1, i2, + /// ...]] /// @param bi A list of integers representing the indices to get a view on /// @return The grid batch vieweed at the specified indices - GridBatch index(const std::vector& bi) const { + GridBatch + index(const std::vector &bi) const { return GridBatch(impl()->index(bi)); } - /// @brief Return a view of this grid batch at indices specified by the given mask i.e. grid_batch[mask] + /// @brief Return a view of this grid batch at indices specified by the given mask i.e. + /// grid_batch[mask] /// @param bi A list of integers representing the indices to get a view on /// @return The grid batch vieweed at the specified indices - GridBatch index(const std::vector& bi) const { + GridBatch + index(const std::vector &bi) const { return GridBatch(impl()->index(bi)); } - /// @brief Return a view of this grid batch at the specified indices (or mask if bi is a bool tensor) i.e. grid_batch[[i1, i2, ...]] + /// @brief Return a view of this grid batch at the specified indices (or mask if bi is a bool + /// tensor) i.e. 
grid_batch[[i1, i2, ...]] /// @param bi A list of integers representing the indices to get a view on /// @return The grid batch vieweed at the specified indices - GridBatch index(const torch::Tensor& bi) const { + GridBatch + index(const torch::Tensor &bi) const { return GridBatch(impl()->index(bi)); } /// @brief Return a JaggedTensor whose joffsets and jidx match this grid batch's - /// @param data The data to use for the JaggedTensor (first dimension must match the total number of voxels in the grid batch) - /// @param ignore_disabled If true, then voxels which are disabled will be included in the returned JaggedTensor + /// @param data The data to use for the JaggedTensor (first dimension must match the total + /// number of voxels in the grid batch) + /// @param ignore_disabled If true, then voxels which are disabled will be included in the + /// returned JaggedTensor /// @return A JaggedTensor corresponding to the voxel grid of this grid batch - JaggedTensor jagged_like(const torch::Tensor& data, bool ignore_disabled = true) const { + JaggedTensor + jagged_like(const torch::Tensor &data, bool ignore_disabled = true) const { return impl()->jaggedTensor(data, ignore_disabled); } /// @brief Populate the grid batch with voxels that intersect a triangle mesh - /// @param vertices A JaggedTensor of shape [B, -1, 3] containing one vertex set per grid to create + /// @param vertices A JaggedTensor of shape [B, -1, 3] containing one vertex set per grid to + /// create /// @param faces A JaggedTensor of shape [B, -1, 3] containing one face set per grid to create - /// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in the batch or one voxel size for all grids - /// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, 0, 0] voxel + /// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid + /// in the batch or one voxel size for all grids + /// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the + /// [0, 0, 0] voxel /// for each grid in the batch, or one origin for all grids - void set_from_mesh(const JaggedTensor& vertices, - const JaggedTensor& faces, - const Vec3dBatchOrScalar& voxel_sizes = 1.0, - const Vec3dBatch& origins = torch::zeros(3, torch::kInt32)); + void set_from_mesh(const JaggedTensor &vertices, const JaggedTensor &faces, + const Vec3dBatchOrScalar &voxel_sizes = 1.0, + const Vec3dBatch &origins = torch::zeros(3, torch::kInt32)); - /// @brief Populate the grid batch with voxels which contain a point in an input set of point clouds + /// @brief Populate the grid batch with voxels which contain a point in an input set of point + /// clouds /// (possibly padding each voxel containing a point) - /// @param points A JaggedTensor with shape [B, -1, 3] containing one point set per grid to create - /// @param pad_min A tensor of shape [3,] containing the number of voxels to pad each inserted voxel with to the left/back/bottom - /// @param pad_max A tensor of shape [3,] containing the number of voxels to pad each inserted voxel with to the right/front/top - /// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in the batch or one voxel size for all grids - /// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, 0, 0] voxel + /// @param points A JaggedTensor with shape [B, -1, 3] containing one point set per grid to + /// create + /// @param 
pad_min A tensor of shape [3,] containing the number of voxels to pad each inserted + /// voxel with to the left/back/bottom + /// @param pad_max A tensor of shape [3,] containing the number of voxels to pad each inserted + /// voxel with to the right/front/top + /// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid + /// in the batch or one voxel size for all grids + /// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the + /// [0, 0, 0] voxel /// for each grid in the batch, or one origin for all grids /// @param isMutable Whether the grid should be mutable or not - void set_from_points(const JaggedTensor& points, - const Vec3i& pad_min = torch::zeros(3, torch::kInt32), - const Vec3i& pad_max = torch::zeros(3, torch::kInt32), - const Vec3dBatchOrScalar& voxel_sizes = 1.0, - const Vec3dBatch& origins = torch::zeros(3, torch::kInt32)); - - /// @brief Populate the grid batch with the eight nearest voxels to each point in an input set of point clouds - /// @param points A JaggedTensor with shape [B, -1, 3] containing one point set per grid to create - /// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in the batch or one voxel size for all grids - /// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, 0, 0] voxel + void set_from_points(const JaggedTensor &points, + const Vec3i &pad_min = torch::zeros(3, torch::kInt32), + const Vec3i &pad_max = torch::zeros(3, torch::kInt32), + const Vec3dBatchOrScalar &voxel_sizes = 1.0, + const Vec3dBatch &origins = torch::zeros(3, torch::kInt32)); + + /// @brief Populate the grid batch with the eight nearest voxels to each point in an input set + /// of point clouds + /// @param points A JaggedTensor with shape [B, -1, 3] containing one point set per grid to + /// create + /// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid + /// in the batch or one voxel size for all grids + /// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the + /// [0, 0, 0] voxel /// for each grid in the batch, or one origin for all grids /// @param isMutable Whether the grid should be mutable or not - void set_from_nearest_voxels_to_points(const JaggedTensor& points, - const Vec3dBatchOrScalar& voxel_sizes = 1.0, - const Vec3dBatch& origins = torch::zeros(3, torch::kInt32)); - + void set_from_nearest_voxels_to_points(const JaggedTensor &points, + const Vec3dBatchOrScalar &voxel_sizes = 1.0, + const Vec3dBatch &origins = torch::zeros(3, + torch::kInt32)); /// @brief Populate the grid batch with the specified voxel coordinates (possibly with padding) - /// @param ijk A JaggedTensor of shape [B, -1, 3] specifying the coordinates of each voxel to insert - /// @param pad_min A tensor of shape [3,] containing the number of voxels to pad each inserted voxel with to the left/back/bottom - /// @param pad_max A tensor of shape [3,] containing the number of voxels to pad each inserted voxel with to the right/front/top - /// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in the batch or one voxel size for all grids - /// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, 0, 0] voxel + /// @param ijk A JaggedTensor of shape [B, -1, 3] specifying the coordinates of each voxel to + /// insert + /// @param pad_min A tensor of shape [3,] containing the number 
of voxels to pad each inserted + /// voxel with to the left/back/bottom + /// @param pad_max A tensor of shape [3,] containing the number of voxels to pad each inserted + /// voxel with to the right/front/top + /// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid + /// in the batch or one voxel size for all grids + /// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the + /// [0, 0, 0] voxel /// for each grid in the batch, or one origin for all grids - void set_from_ijk(const JaggedTensor& ijk, - const Vec3i& pad_min = torch::zeros(3, torch::kInt32), - const Vec3i& pad_max = torch::zeros(3, torch::kInt32), - const Vec3dBatchOrScalar& voxel_sizes = 1.0, - const Vec3dBatch& origins = torch::zeros(3, torch::kInt32)); + void set_from_ijk(const JaggedTensor &ijk, + const Vec3i &pad_min = torch::zeros(3, torch::kInt32), + const Vec3i &pad_max = torch::zeros(3, torch::kInt32), + const Vec3dBatchOrScalar &voxel_sizes = 1.0, + const Vec3dBatch &origins = torch::zeros(3, torch::kInt32)); /// @brief Populate the grid batch densely from ijk_min to ijk_min + size /// @param num_grids The number of grids to create in the batch /// @param dense_dims The size of each dense grid (shape [3,] = [W, H, D]) /// @param ijk_min The minimum ijk coordinate of each dense grid in the batch (shape [3,]) - /// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid in the batch or one voxel size for all grids - /// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the [0, 0, 0] voxel + /// @param voxel_sizes A tensor of shape [B, 3] or [3,] containing the voxel size of each grid + /// in the batch or one voxel size for all grids + /// @param origins A tensor of shape [B, 3] or [3,] containing the world space coordinate of the + /// [0, 0, 0] voxel /// for each grid in the batch, or one origin for all grids - /// @param mask Optional mask of shape [W, H, D] to specify voxels which are included in the dense grid. + /// @param mask Optional mask of shape [W, H, D] to specify voxels which are included in the + /// dense grid. /// Note that the same mask will be re-used for all the grids in the batch. 
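    // Illustrative usage sketch for the point-based builders documented above
    // (a minimal example, not taken from the fVDB sources): build one grid from
    // a point cloud with a single voxel of padding on every side. It assumes
    // GridBatch is default-constructible and uses arbitrary point data; the
    // scalar voxel size relies on the Vec3dBatchOrScalar conversion implied by
    // the 1.0 default argument.
    //
    //     std::vector<torch::Tensor> pointSets = { torch::rand({ 1000, 3 }) };
    //     fvdb::JaggedTensor         points(pointSets);   // batch of one point set
    //     fvdb::GridBatch            grid;
    //     grid.set_from_points(points,
    //                          /*pad_min=*/torch::ones(3, torch::kInt32),
    //                          /*pad_max=*/torch::ones(3, torch::kInt32),
    //                          /*voxel_sizes=*/0.1);
    //
    // set_from_ijk and set_from_dense_grid (below) follow the same pattern,
    // taking voxel coordinates or dense dimensions instead of world-space points.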
- void set_from_dense_grid(const int64_t num_grids, - const Vec3i& dense_dims, - const Vec3i& ijk_min = torch::zeros(3, torch::kInt32), - const Vec3dBatchOrScalar& voxel_sizes = 1.0, - const Vec3dBatch& origins = torch::zeros(3), - torch::optional mask = torch::nullopt); + void set_from_dense_grid(const int64_t num_grids, const Vec3i &dense_dims, + const Vec3i &ijk_min = torch::zeros(3, torch::kInt32), + const Vec3dBatchOrScalar &voxel_sizes = 1.0, + const Vec3dBatch &origins = torch::zeros(3), + torch::optional mask = torch::nullopt); /// @brief Serialize this grid batch to a torch tensor of bytes (dtype = int8) /// @return A serialized grid batch encoded as a torch::Tensor of type int8 - torch::Tensor serialize() const { + torch::Tensor + serialize() const { return impl()->serialize(); } /// @brief Deserialize an int8 tensor (returned by serialize()) into a grid batch /// @param data A tensor enccoding a serialized grid batch as an int8 tensor /// @return The deserializes grid batch - static GridBatch deserialize(const torch::Tensor& data) { + static GridBatch + deserialize(const torch::Tensor &data) { return GridBatch(detail::GridBatchImpl::deserialize(data)); } /// @brief Return an integer representing the actual data /// @return the value - int64_t address() const { + int64_t + address() const { return reinterpret_cast(impl().get()); } /// @brief Get the underlying nanovdb::GridHandle for the grid batch /// @return The underlying nanovdb::GridHandle for the grid batch - const nanovdb::GridHandle& nanovdb_grid_handle() const { + const nanovdb::GridHandle & + nanovdb_grid_handle() const { return impl()->nanoGridHandle(); } - inline const c10::intrusive_ptr impl() const { + inline const c10::intrusive_ptr + impl() const { return mImpl; } -private: - - void buildCoarseFromFineGrid(const GridBatch& fineGrid, nanovdb::Coord branchFactor); + private: + void buildCoarseFromFineGrid(const GridBatch &fineGrid, nanovdb::Coord branchFactor); - void buildFineFromCoarseGrid(const GridBatch& coarseGrid, const torch::optional& subdivMask, nanovdb::Coord subdivFactor); + void buildFineFromCoarseGrid(const GridBatch &coarseGrid, + const torch::optional &subdivMask, + nanovdb::Coord subdivFactor); - void buildDualFromPrimalGrid(const GridBatch& primalGrid, bool excludeBorder = false); + void buildDualFromPrimalGrid(const GridBatch &primalGrid, bool excludeBorder = false); c10::intrusive_ptr mImpl; }; - // using GridBatchPtr = c10::intrusive_ptr; } // namespace fvdb + +#endif // FVDB_GRIDBATCH_H \ No newline at end of file diff --git a/fvdb/src/JaggedTensor.cpp b/fvdb/src/JaggedTensor.cpp index 5486aa10e4..9f1531c5a1 100644 --- a/fvdb/src/JaggedTensor.cpp +++ b/fvdb/src/JaggedTensor.cpp @@ -2,49 +2,61 @@ // SPDX-License-Identifier: MPL-2.0 // #include "JaggedTensor.h" + #include "Config.h" -#include "detail/ops/Ops.h" -#include "detail/utils/Utils.h" #include "detail/autograd/JaggedReduce.h" +#include "detail/ops/Ops.h" #include "detail/ops/jagged/JaggedOps.h" +#include "detail/utils/Utils.h" namespace fvdb { -void JaggedTensor::binary_op_check(const JaggedTensor& other) const { - TORCH_CHECK(this->device() == other.device(), "device should match between this tensor and other tensor"); - TORCH_CHECK(mData.sizes().equals(other.jdata().sizes()), "data shape should match between this tensor and other tensor"); - TORCH_CHECK(mBatchIdx.sizes().equals(other.jidx().sizes()), "batch indices' shape should match between this tensor and other tensor"); - 
TORCH_CHECK(mOffsets.sizes().equals(other.joffsets().sizes()), "offsets shape should match between this tensor and other tensor"); +void +JaggedTensor::binary_op_check(const JaggedTensor &other) const { + TORCH_CHECK(this->device() == other.device(), + "device should match between this tensor and other tensor"); + TORCH_CHECK(mData.sizes().equals(other.jdata().sizes()), + "data shape should match between this tensor and other tensor"); + TORCH_CHECK(mBatchIdx.sizes().equals(other.jidx().sizes()), + "batch indices' shape should match between this tensor and other tensor"); + TORCH_CHECK(mOffsets.sizes().equals(other.joffsets().sizes()), + "offsets shape should match between this tensor and other tensor"); if (Config::global().pendanticErrorCheckingEnabled()) { // This is a slow check that we cap optionally do for correctness. - TORCH_CHECK_VALUE(torch::equal(mOffsets, other.joffsets()), "offsets shape should match between this tensor and other tensor"); - TORCH_CHECK_VALUE(torch::equal(other.mListIdx, mListIdx), - "JaggedTensors must have the same lshape. ", - "This error was raised because config.pendatic_error_checking was enabled"); + TORCH_CHECK_VALUE(torch::equal(mOffsets, other.joffsets()), + "offsets shape should match between this tensor and other tensor"); + TORCH_CHECK_VALUE( + torch::equal(other.mListIdx, mListIdx), "JaggedTensors must have the same lshape. ", + "This error was raised because config.pendatic_error_checking was enabled"); } } -torch::Tensor JaggedTensor::joffsets_from_jidx_and_jdata(torch::Tensor jidx, torch::Tensor jdata, int64_t num_tensors) { +torch::Tensor +JaggedTensor::joffsets_from_jidx_and_jdata(torch::Tensor jidx, torch::Tensor jdata, + int64_t num_tensors) { return FVDB_DISPATCH_KERNEL_DEVICE(jdata.device(), [&]() { return detail::ops::dispatchJOffsetsForJIdx(jidx, jdata, num_tensors); }); } -torch::Tensor JaggedTensor::jidx_from_joffsets(torch::Tensor joffsets, int64_t num_elements) { +torch::Tensor +JaggedTensor::jidx_from_joffsets(torch::Tensor joffsets, int64_t num_elements) { return FVDB_DISPATCH_KERNEL_DEVICE(joffsets.device(), [&]() { return detail::ops::dispatchJIdxForJOffsets(joffsets, num_elements); }); } JaggedTensor::JaggedTensor(torch::Tensor data) - : mData(data), mBatchIdx(torch::empty({0}, torch::TensorOptions().dtype(JIdxScalarType).device(data.device()))) { - mListIdx = torch::empty({0, 1}, torch::TensorOptions().dtype(JLIdxScalarType).device(data.device())); - mOffsets = joffsets_from_jidx_and_jdata(mBatchIdx, mData, 1); + : mData(data), mBatchIdx(torch::empty( + { 0 }, torch::TensorOptions().dtype(JIdxScalarType).device(data.device()))) { + mListIdx = + torch::empty({ 0, 1 }, torch::TensorOptions().dtype(JLIdxScalarType).device(data.device())); + mOffsets = joffsets_from_jidx_and_jdata(mBatchIdx, mData, 1); mNumOuterLists = 1; } -JaggedTensor::JaggedTensor(const std::vector& tensors) { +JaggedTensor::JaggedTensor(const std::vector &tensors) { // TODO: (Francis): rewrite as a cuda kernel TORCH_CHECK(tensors.size() > 0, "empty tensor list"); @@ -56,10 +68,15 @@ JaggedTensor::JaggedTensor(const std::vector& tensors) { if (tensors[0].dim() == 0) { mData = mData.unsqueeze(0); } - TORCH_CHECK(mData.dim() > 0, "assigned data must have shape [N, ...], but got data.dim() = 0"); - mBatchIdx = torch::empty({0}, torch::TensorOptions().dtype(JIdxScalarType).device(mData.device())); - mOffsets = torch::tensor({JOffsetsType(0), mData.size(0)}, torch::TensorOptions().dtype(JOffsetsScalarType).device(mData.device())); - mListIdx = torch::empty({0, 1}, 
torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device())); + TORCH_CHECK(mData.dim() > 0, + "assigned data must have shape [N, ...], but got data.dim() = 0"); + mBatchIdx = torch::empty( + { 0 }, torch::TensorOptions().dtype(JIdxScalarType).device(mData.device())); + mOffsets = + torch::tensor({ JOffsetsType(0), mData.size(0) }, + torch::TensorOptions().dtype(JOffsetsScalarType).device(mData.device())); + mListIdx = torch::empty( + { 0, 1 }, torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device())); mNumOuterLists = 1; return; } @@ -67,45 +84,49 @@ JaggedTensor::JaggedTensor(const std::vector& tensors) { torch::Device device = tensors[0].device(); std::vector jIdxs; - mOffsets = torch::empty({(JOffsetsType) tensors.size() + 1}, torch::TensorOptions().dtype(JOffsetsScalarType).device(torch::kCPU)); + mOffsets = torch::empty({ (JOffsetsType)tensors.size() + 1 }, + torch::TensorOptions().dtype(JOffsetsScalarType).device(torch::kCPU)); auto elementCountsAcc = mOffsets.accessor(); - elementCountsAcc[0] = 0; + elementCountsAcc[0] = 0; jIdxs.reserve(tensors.size()); - std::vector tensorsReshaped; // Reshape 0D tensors to 1D + std::vector tensorsReshaped; // Reshape 0D tensors to 1D tensorsReshaped.reserve(tensors.size()); for (size_t i = 0; i < tensors.size(); ++i) { TORCH_CHECK_VALUE(tensors[i].device() == device, "All tensors must be on the same device"); if (tensors[i].dim() == 0 && tensors[i].numel() == 1) { - tensorsReshaped.push_back(tensors[i].view({1})); + tensorsReshaped.push_back(tensors[i].view({ 1 })); } else { tensorsReshaped.push_back(tensors[i]); } - jIdxs.push_back(torch::full({tensorsReshaped[i].size(0)}, (int) i, torch::TensorOptions().dtype(JIdxScalarType).device(tensorsReshaped[i].device()))); - elementCountsAcc[i+1] = tensorsReshaped[i].size(0); + jIdxs.push_back(torch::full( + { tensorsReshaped[i].size(0) }, (int)i, + torch::TensorOptions().dtype(JIdxScalarType).device(tensorsReshaped[i].device()))); + elementCountsAcc[i + 1] = tensorsReshaped[i].size(0); } mOffsets = mOffsets.to(tensors[0].device()); torch::cumsum_out(mOffsets, mOffsets, 0); mBatchIdx = torch::cat(jIdxs, 0); - mData = torch::cat(tensorsReshaped, 0); - mListIdx = torch::empty({0, 1}, torch::TensorOptions().dtype(JLIdxScalarType).device(device)); + mData = torch::cat(tensorsReshaped, 0); + mListIdx = torch::empty({ 0, 1 }, torch::TensorOptions().dtype(JLIdxScalarType).device(device)); mNumOuterLists = tensors.size(); } -JaggedTensor::JaggedTensor(const std::vector>& tensors) { +JaggedTensor::JaggedTensor(const std::vector> &tensors) { // TODO: (Francis): rewrite as a cuda kernel - torch::Device device = torch::kCPU; - bool deviceIsNotSet = true; - JOffsetsType totalTensors = 0; + torch::Device device = torch::kCPU; + bool deviceIsNotSet = true; + JOffsetsType totalTensors = 0; TORCH_CHECK(tensors.size() > 0, "empty tensor list"); for (size_t i = 0; i < tensors.size(); ++i) { for (size_t j = 0; j < tensors[i].size(); j += 1) { if (deviceIsNotSet) { - device = tensors[i][j].device(); + device = tensors[i][j].device(); deviceIsNotSet = false; } - TORCH_CHECK_VALUE(tensors[i][j].device() == device, "All tensors must be on the same device"); + TORCH_CHECK_VALUE(tensors[i][j].device() == device, + "All tensors must be on the same device"); totalTensors += 1; } } @@ -113,16 +134,23 @@ JaggedTensor::JaggedTensor(const std::vector>& tensor // This is an implementation detail where we don't store jidx for // a single list since everything is just zero by default. 
if (totalTensors == 1) { - TORCH_CHECK(tensors.size() == 1, "Single tensor must be a 1D tensor. This should never happen."); - TORCH_CHECK(tensors[0].size() == 1, "Single tensor must be a 1D tensor. This should never happen."); + TORCH_CHECK(tensors.size() == 1, + "Single tensor must be a 1D tensor. This should never happen."); + TORCH_CHECK(tensors[0].size() == 1, + "Single tensor must be a 1D tensor. This should never happen."); mData = tensors[0][0]; if (mData.dim() == 0) { mData = mData.unsqueeze(0); } - TORCH_CHECK(mData.dim() > 0, "assigned data must have shape [N, ...], but got data.dim() = 0"); - mBatchIdx = torch::empty({0}, torch::TensorOptions().dtype(JIdxScalarType).device(mData.device())); - mOffsets = torch::tensor({JOffsetsType(0), mData.size(0)}, torch::TensorOptions().dtype(JOffsetsScalarType).device(mData.device())); - mListIdx = torch::zeros({1, 2}, torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device())); + TORCH_CHECK(mData.dim() > 0, + "assigned data must have shape [N, ...], but got data.dim() = 0"); + mBatchIdx = torch::empty( + { 0 }, torch::TensorOptions().dtype(JIdxScalarType).device(mData.device())); + mOffsets = + torch::tensor({ JOffsetsType(0), mData.size(0) }, + torch::TensorOptions().dtype(JOffsetsScalarType).device(mData.device())); + mListIdx = torch::zeros( + { 1, 2 }, torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device())); mNumOuterLists = 1; return; } @@ -131,14 +159,17 @@ JaggedTensor::JaggedTensor(const std::vector>& tensor std::vector batchIdxs; batchIdxs.reserve(totalTensors); - mOffsets = torch::empty({totalTensors + 1}, torch::TensorOptions().dtype(JOffsetsScalarType).device(torch::kCPU)); + mOffsets = torch::empty({ totalTensors + 1 }, + torch::TensorOptions().dtype(JOffsetsScalarType).device(torch::kCPU)); auto elementCountsAcc = mOffsets.accessor(); - elementCountsAcc[0] = 0; + elementCountsAcc[0] = 0; - torch::Tensor listIndexes = torch::empty({totalTensors, (JLIdxType) 2}, torch::TensorOptions().dtype(JLIdxScalarType).device(torch::kCPU)); + torch::Tensor listIndexes = + torch::empty({ totalTensors, (JLIdxType)2 }, + torch::TensorOptions().dtype(JLIdxScalarType).device(torch::kCPU)); auto listIndexesAcc = listIndexes.accessor(); - std::vector tensorsReshaped; // Reshape 0D tensors to 1D + std::vector tensorsReshaped; // Reshape 0D tensors to 1D tensorsReshaped.reserve(totalTensors); int64_t tensorCount = 0; @@ -149,51 +180,59 @@ JaggedTensor::JaggedTensor(const std::vector>& tensor torch::Tensor tij = tensors[i][j]; if (tij.dim() == 0 && tij.numel() == 1) { - tensorsReshaped.push_back(tij.view({1})); + tensorsReshaped.push_back(tij.view({ 1 })); } else { tensorsReshaped.push_back(tij); } - batchIdxs.push_back(torch::full({tensorsReshaped[tensorCount].size(0)}, - tensorCount, - torch::TensorOptions().dtype(JIdxScalarType).device(device))); - elementCountsAcc[tensorCount+1] = tensorsReshaped[tensorCount].size(0); + batchIdxs.push_back( + torch::full({ tensorsReshaped[tensorCount].size(0) }, tensorCount, + torch::TensorOptions().dtype(JIdxScalarType).device(device))); + elementCountsAcc[tensorCount + 1] = tensorsReshaped[tensorCount].size(0); tensorCount += 1; - } } mOffsets = mOffsets.to(device); torch::cumsum_out(mOffsets, mOffsets, 0); - mBatchIdx = torch::cat(batchIdxs, 0); - mData = torch::cat(tensorsReshaped, 0); - mListIdx = listIndexes.to(device); + mBatchIdx = torch::cat(batchIdxs, 0); + mData = torch::cat(tensorsReshaped, 0); + mListIdx = listIndexes.to(device); mNumOuterLists = tensors.size(); } 
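// Illustrative sketch of the nested-list constructor above (not taken from the
// fVDB sources; shapes are arbitrary): a list of lists of tensors is flattened
// into jdata plus offset / list-index metadata.
//
//     std::vector<std::vector<torch::Tensor>> nested = {
//         { torch::rand({ 3, 4 }), torch::rand({ 5, 4 }) },   // outer list 0: two tensors
//         { torch::rand({ 2, 4 }) }                           // outer list 1: one tensor
//     };
//     fvdb::JaggedTensor jt(nested);
//
// Expected layout, following the constructor logic above:
//     jt.jdata()            -> shape [10, 4]       (3 + 5 + 2 rows concatenated)
//     jt.joffsets()         -> [0, 3, 8, 10]       (per-tensor start/end offsets)
//     jt.jlidx()            -> [[0,0],[0,1],[1,0]] (outer, inner index per tensor)
//     jt.num_outer_lists()  -> 2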
-JaggedTensor::JaggedTensor(const std::vector& lsizes, const torch::Tensor data) { +JaggedTensor::JaggedTensor(const std::vector &lsizes, const torch::Tensor data) { // TODO: (Francis): rewrite as a cuda kernel TORCH_CHECK_VALUE(lsizes.size() > 0, "empty list sizes"); // This is an implementation detail where we don't store jidx for // a single list since everything is just zero by default. if (lsizes.size() == 1) { - TORCH_CHECK_VALUE(lsizes[0] == data.size(0), "Sum of list sizes must equal the number of elements in data"); - mOffsets = torch::tensor({JOffsetsType(0), data.size(0)}, torch::TensorOptions().dtype(JOffsetsScalarType).device(data.device())); - mListIdx = torch::empty({0, 1}, torch::TensorOptions().dtype(JLIdxScalarType).device(data.device())); + TORCH_CHECK_VALUE(lsizes[0] == data.size(0), + "Sum of list sizes must equal the number of elements in data"); + mOffsets = + torch::tensor({ JOffsetsType(0), data.size(0) }, + torch::TensorOptions().dtype(JOffsetsScalarType).device(data.device())); + mListIdx = torch::empty( + { 0, 1 }, torch::TensorOptions().dtype(JLIdxScalarType).device(data.device())); mNumOuterLists = 1; - mBatchIdx = torch::empty({0}, torch::TensorOptions().dtype(JIdxScalarType).device(data.device())); + mBatchIdx = + torch::empty({ 0 }, torch::TensorOptions().dtype(JIdxScalarType).device(data.device())); mData = data; if (mData.dim() == 0) { mData = mData.unsqueeze(0); } - TORCH_CHECK(mData.dim() > 0, "assigned data must have shape [N, ...], but got data.dim() = 0"); + TORCH_CHECK(mData.dim() > 0, + "assigned data must have shape [N, ...], but got data.dim() = 0"); return; } - torch::Tensor offsetsCPU = torch::empty({(JOffsetsType) lsizes.size() + 1}, torch::TensorOptions().dtype(JOffsetsScalarType).device(torch::kCPU)); + torch::Tensor offsetsCPU = + torch::empty({ (JOffsetsType)lsizes.size() + 1 }, + torch::TensorOptions().dtype(JOffsetsScalarType).device(torch::kCPU)); auto offsetsCPUAcc = offsetsCPU.accessor(); - mListIdx = torch::empty({0, 1}, torch::TensorOptions().dtype(JLIdxScalarType).device(data.device())); + mListIdx = + torch::empty({ 0, 1 }, torch::TensorOptions().dtype(JLIdxScalarType).device(data.device())); mNumOuterLists = lsizes.size(); JOffsetsType cumulativeElements = 0; @@ -202,46 +241,59 @@ JaggedTensor::JaggedTensor(const std::vector& lsizes, const torch::Tens cumulativeElements += lsizes[i]; } offsetsCPUAcc[lsizes.size()] = cumulativeElements; - TORCH_CHECK_VALUE(cumulativeElements == data.size(0), "Sum of list sizes must equal the number of elements in data"); + TORCH_CHECK_VALUE(cumulativeElements == data.size(0), + "Sum of list sizes must equal the number of elements in data"); - mOffsets = offsetsCPU.to(data.device()); - mData = data; + mOffsets = offsetsCPU.to(data.device()); + mData = data; mBatchIdx = jidx_from_joffsets(mOffsets, data.size(0)); } -JaggedTensor::JaggedTensor(const std::vector>& lsizes, const int64_t totalTensors, const torch::Tensor data) { +JaggedTensor::JaggedTensor(const std::vector> &lsizes, + const int64_t totalTensors, const torch::Tensor data) { // TODO (Francis) : Rewrite as a cuda kernel TORCH_CHECK_VALUE(lsizes.size() > 0, "empty lshape"); // This is an implementation detail where we don't store jidx for // a single list since everything is just zero by default. if (totalTensors == 1) { - TORCH_CHECK(lsizes.size() == 1, "Single tensor must be a 1D tensor. This should never happen."); - TORCH_CHECK(lsizes[0].size() == 1, "Single tensor must be a 1D tensor. 
This should never happen."); + TORCH_CHECK(lsizes.size() == 1, + "Single tensor must be a 1D tensor. This should never happen."); + TORCH_CHECK(lsizes[0].size() == 1, + "Single tensor must be a 1D tensor. This should never happen."); TORCH_CHECK_VALUE(lsizes[0][0] == data.size(0), "Invalid size for data tensor."); mData = data; if (mData.dim() == 0) { mData = mData.unsqueeze(0); } - TORCH_CHECK(mData.dim() > 0, "assigned data must have shape [N, ...], but got data.dim() = 0"); - mBatchIdx = torch::empty({0}, torch::TensorOptions().dtype(JIdxScalarType).device(mData.device())); - mOffsets = torch::tensor({JOffsetsType(0), mData.size(0)}, torch::TensorOptions().dtype(JOffsetsScalarType).device(mData.device())); - mListIdx = torch::zeros({1, 2}, torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device())); + TORCH_CHECK(mData.dim() > 0, + "assigned data must have shape [N, ...], but got data.dim() = 0"); + mBatchIdx = torch::empty( + { 0 }, torch::TensorOptions().dtype(JIdxScalarType).device(mData.device())); + mOffsets = + torch::tensor({ JOffsetsType(0), mData.size(0) }, + torch::TensorOptions().dtype(JOffsetsScalarType).device(mData.device())); + mListIdx = torch::zeros( + { 1, 2 }, torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device())); mNumOuterLists = 1; return; } - torch::Tensor offsetsCPU = torch::empty({(JOffsetsType) totalTensors + 1}, torch::TensorOptions().dtype(JOffsetsScalarType).device(torch::kCPU)); - torch::Tensor listIdsCPU = torch::empty({(JLIdxType) totalTensors, 2}, torch::TensorOptions().dtype(JLIdxScalarType).device(torch::kCPU)); + torch::Tensor offsetsCPU = + torch::empty({ (JOffsetsType)totalTensors + 1 }, + torch::TensorOptions().dtype(JOffsetsScalarType).device(torch::kCPU)); + torch::Tensor listIdsCPU = + torch::empty({ (JLIdxType)totalTensors, 2 }, + torch::TensorOptions().dtype(JLIdxScalarType).device(torch::kCPU)); auto offsetsCPUAcc = offsetsCPU.accessor(); auto listIdsCPUAcc = listIdsCPU.accessor(); JOffsetsType cumulativeElements = 0; - int64_t tensorCount = 0; + int64_t tensorCount = 0; for (size_t i = 0; i < lsizes.size(); ++i) { TORCH_CHECK_VALUE(lsizes[i].size() > 0, "empty lshape"); for (size_t j = 0; j < lsizes[i].size(); j += 1) { - offsetsCPUAcc[tensorCount] = cumulativeElements; + offsetsCPUAcc[tensorCount] = cumulativeElements; listIdsCPUAcc[tensorCount][0] = i; listIdsCPUAcc[tensorCount][1] = j; cumulativeElements += lsizes[i][j]; @@ -249,26 +301,28 @@ JaggedTensor::JaggedTensor(const std::vector>& lsizes, cons } } offsetsCPUAcc[totalTensors] = cumulativeElements; - TORCH_CHECK_VALUE(cumulativeElements == data.size(0), "Sum of list sizes must equal the number of elements in data"); + TORCH_CHECK_VALUE(cumulativeElements == data.size(0), + "Sum of list sizes must equal the number of elements in data"); - mOffsets = offsetsCPU.to(data.device()); - mListIdx = listIdsCPU.to(data.device()); - mBatchIdx = jidx_from_joffsets(mOffsets, data.size(0)); - mData = data; + mOffsets = offsetsCPU.to(data.device()); + mListIdx = listIdsCPU.to(data.device()); + mBatchIdx = jidx_from_joffsets(mOffsets, data.size(0)); + mData = data; mNumOuterLists = lsizes.size(); } -void JaggedTensor::recompute_lsizes_if_dirty() { +void +JaggedTensor::recompute_lsizes_if_dirty() { if (!mLShapeCache.mDirty) { return; } mLShapeCache.clear(); if (ldim() == 1) { const torch::Tensor offsetsCpu = mOffsets.cpu(); - const auto acc = offsetsCpu.accessor(); + const auto acc = offsetsCpu.accessor(); for (int i = 0; i < num_tensors(); ++i) { const JOffsetsType startIdx 
= acc[i]; - const JOffsetsType endIdx = acc[i+1]; + const JOffsetsType endIdx = acc[i + 1]; mLShapeCache.mLShape1.push_back(endIdx - startIdx); } mLShapeCache.mDirty = false; @@ -276,8 +330,8 @@ void JaggedTensor::recompute_lsizes_if_dirty() { } else if (ldim() == 2) { const torch::Tensor offsetsCpu = mOffsets.cpu(); const torch::Tensor listIdxCpu = mListIdx.cpu(); - const auto offAcc = offsetsCpu.accessor(); - const auto lixAcc = listIdxCpu.accessor(); + const auto offAcc = offsetsCpu.accessor(); + const auto lixAcc = listIdxCpu.accessor(); ssize_t currentList = -1; for (int i = 0; i < num_tensors(); ++i) { @@ -288,37 +342,41 @@ void JaggedTensor::recompute_lsizes_if_dirty() { mLShapeCache.mLShape2.push_back(std::vector()); } const JOffsetsType startIdx = offAcc[i]; - const JOffsetsType endIdx = offAcc[i+1]; + const JOffsetsType endIdx = offAcc[i + 1]; mLShapeCache.mLShape2.back().push_back(endIdx - startIdx); } mLShapeCache.mDirty = false; return; } else { - TORCH_CHECK(false, "Unsupported list dimension. Currently JaggedTensor only supports up to 2."); + TORCH_CHECK(false, + "Unsupported list dimension. Currently JaggedTensor only supports up to 2."); } } -std::vector JaggedTensor::unbind1() const { +std::vector +JaggedTensor::unbind1() const { std::vector ret(num_tensors()); int64_t ldim = mListIdx.size(1); if (ldim != 1) { - TORCH_WARN("Calling unbind on a multidimensional list of jagged tensors will return a flattened list"); + TORCH_WARN( + "Calling unbind on a multidimensional list of jagged tensors will return a flattened list"); } torch::Tensor offsetsCpu = mOffsets.cpu(); - auto acc = offsetsCpu.accessor(); + auto acc = offsetsCpu.accessor(); for (int i = 0; i < num_tensors(); ++i) { const JOffsetsType startIdx = acc[i]; - const JOffsetsType endIdx = acc[i+1]; + const JOffsetsType endIdx = acc[i + 1]; - ret[i] = mData.index({torch::indexing::Slice(startIdx, endIdx)}); + ret[i] = mData.index({ torch::indexing::Slice(startIdx, endIdx) }); } return ret; } -std::vector> JaggedTensor::unbind2() const { +std::vector> +JaggedTensor::unbind2() const { std::vector> ret; int64_t ldim = mListIdx.size(1); @@ -327,9 +385,9 @@ std::vector> JaggedTensor::unbind2() const { TORCH_CHECK_VALUE(false, "Called unbind2() on a list with list dimension != 2"); } - torch::Tensor listIdxCpu = mListIdx.cpu(); - torch::Tensor offsetsCpu = mOffsets.cpu(); - ssize_t currentList = -1; + torch::Tensor listIdxCpu = mListIdx.cpu(); + torch::Tensor offsetsCpu = mOffsets.cpu(); + ssize_t currentList = -1; auto offAcc = offsetsCpu.accessor(); auto lixAcc = listIdxCpu.accessor(); @@ -342,34 +400,38 @@ std::vector> JaggedTensor::unbind2() const { ret.push_back(std::vector()); } const JOffsetsType startIdx = offAcc[i]; - const JOffsetsType endIdx = offAcc[i+1]; - + const JOffsetsType endIdx = offAcc[i + 1]; - ret.back().push_back(mData.index({torch::indexing::Slice(startIdx, endIdx)})); + ret.back().push_back(mData.index({ torch::indexing::Slice(startIdx, endIdx) })); } return ret; } -std::vector JaggedTensor::lsizes1() const { +std::vector +JaggedTensor::lsizes1() const { TORCH_CHECK(ldim() == 1, "Nesting dimension must be 1"); - const_cast(this)->recompute_lsizes_if_dirty(); + const_cast(this)->recompute_lsizes_if_dirty(); return mLShapeCache.mLShape1; } -std::vector> JaggedTensor::lsizes2() const { +std::vector> +JaggedTensor::lsizes2() const { TORCH_CHECK(ldim() == 2, "Nesting dimension must be 2"); - const_cast(this)->recompute_lsizes_if_dirty(); + const_cast(this)->recompute_lsizes_if_dirty(); return 
mLShapeCache.mLShape2; } -int64_t JaggedTensor::ldim() const { +int64_t +JaggedTensor::ldim() const { TORCH_CHECK_VALUE(mListIdx.dim() == 2, "Corrupt list indices. This should never happen"); - TORCH_CHECK_VALUE(mListIdx.numel() == 0 || mListIdx.size(0) == (mOffsets.size(0) - 1), "Corrupt list indices. This should never happen"); + TORCH_CHECK_VALUE(mListIdx.numel() == 0 || mListIdx.size(0) == (mOffsets.size(0) - 1), + "Corrupt list indices. This should never happen"); return mListIdx.size(1); } -std::vector JaggedTensor::esizes() const { +std::vector +JaggedTensor::esizes() const { std::vector sizes; for (size_t i = 1; i < mData.sizes().size(); i++) { sizes.push_back(mData.size(i)); @@ -377,111 +439,148 @@ std::vector JaggedTensor::esizes() const { return sizes; } -int64_t JaggedTensor::edim() const { +int64_t +JaggedTensor::edim() const { return mData.dim() > 0 ? mData.dim() - 1 : 0; } - -JaggedTensor JaggedTensor::jagged_like(torch::Tensor data) const { - TORCH_CHECK_VALUE(data.dim() > 0, "assigned data must have shape [N, ...], but got data.dim() = 0"); +JaggedTensor +JaggedTensor::jagged_like(torch::Tensor data) const { + TORCH_CHECK_VALUE(data.dim() > 0, + "assigned data must have shape [N, ...], but got data.dim() = 0"); TORCH_CHECK_VALUE(mListIdx.dim() == 2, "Corrupt list indices. This should never happen"); - TORCH_CHECK_VALUE(mListIdx.numel() == 0 || mListIdx.size(0) == (mOffsets.size(0) - 1), "Corrupt list indices. This should never happen"); - TORCH_CHECK_VALUE(data.size(0) == mData.size(0), "Assigned data must have the same number of elements as the JaggedTensor"); + TORCH_CHECK_VALUE(mListIdx.numel() == 0 || mListIdx.size(0) == (mOffsets.size(0) - 1), + "Corrupt list indices. This should never happen"); + TORCH_CHECK_VALUE(data.size(0) == mData.size(0), + "Assigned data must have the same number of elements as the JaggedTensor"); JaggedTensor ret; - ret.mBatchIdx = jidx(); - ret.mOffsets = joffsets(); - ret.mListIdx = jlidx(); + ret.mBatchIdx = jidx(); + ret.mOffsets = joffsets(); + ret.mListIdx = jlidx(); ret.mNumOuterLists = mNumOuterLists; - ret.mData = data.to(device()); - ret.mLShapeCache = mLShapeCache; + ret.mData = data.to(device()); + ret.mLShapeCache = mLShapeCache; return ret; } -JaggedTensor JaggedTensor::from_data_indices_and_list_ids(torch::Tensor data, torch::Tensor indices, torch::Tensor list_ids, int64_t num_tensors) { +JaggedTensor +JaggedTensor::from_data_indices_and_list_ids(torch::Tensor data, torch::Tensor indices, + torch::Tensor list_ids, int64_t num_tensors) { JaggedTensor ret; - ret.mData = data; - ret.mBatchIdx = indices; - ret.mListIdx = list_ids; - ret.mOffsets = joffsets_from_jidx_and_jdata(indices, data, num_tensors); + ret.mData = data; + ret.mBatchIdx = indices; + ret.mListIdx = list_ids; + ret.mOffsets = joffsets_from_jidx_and_jdata(indices, data, num_tensors); ret.mNumOuterLists = ret.joffsets().size(0) - 1; ret.mLShapeCache.markDirty(); return ret; } -JaggedTensor JaggedTensor::from_data_offsets_and_list_ids(torch::Tensor data, torch::Tensor offsets, torch::Tensor list_ids) { - TORCH_CHECK_VALUE(list_ids.dim() == 2, "Invalid list indices when constructing JaggedTensor from data, offsets, and list indices"); - TORCH_CHECK_VALUE(list_ids.numel() == 0 || list_ids.size(0) == (offsets.size(0) - 1), "Invalid list indices when constructing JaggedTensor from data, offsets, and list indices"); - TORCH_CHECK_VALUE(offsets.dim() == 1, "Invalid offsets when constructing JaggedTensor from data, offsets, and list indices"); +JaggedTensor 
+JaggedTensor::from_data_offsets_and_list_ids(torch::Tensor data, torch::Tensor offsets, + torch::Tensor list_ids) { + TORCH_CHECK_VALUE( + list_ids.dim() == 2, + "Invalid list indices when constructing JaggedTensor from data, offsets, and list indices"); + TORCH_CHECK_VALUE( + list_ids.numel() == 0 || list_ids.size(0) == (offsets.size(0) - 1), + "Invalid list indices when constructing JaggedTensor from data, offsets, and list indices"); + TORCH_CHECK_VALUE( + offsets.dim() == 1, + "Invalid offsets when constructing JaggedTensor from data, offsets, and list indices"); JaggedTensor ret; - ret.mData = data; - ret.mOffsets = offsets; - ret.mListIdx = list_ids; + ret.mData = data; + ret.mOffsets = offsets; + ret.mListIdx = list_ids; ret.mNumOuterLists = offsets.size(0) - 1; - ret.mBatchIdx = jidx_from_joffsets(offsets, data.size(0)); + ret.mBatchIdx = jidx_from_joffsets(offsets, data.size(0)); ret.mLShapeCache.markDirty(); return ret; } -JaggedTensor JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(torch::Tensor jdata, torch::Tensor joffsets, - torch::Tensor jidx, torch::Tensor lidx, - int64_t numOuterLists) { - TORCH_CHECK_VALUE(lidx.dim() == 2, "Invalid list indices when constructing JaggedTensor from data, offsets, indices, list indices"); - TORCH_CHECK_VALUE(lidx.numel() == 0 || lidx.size(0) == (joffsets.size(0) - 1), "Invalid list indices when constructing JaggedTensor from data, offsets, indices, list indices"); - TORCH_CHECK_VALUE(joffsets.dim() == 1, "Invalid offsets when constructing JaggedTensor from data, offsets, indices, list indices"); +JaggedTensor +JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(torch::Tensor jdata, torch::Tensor joffsets, + torch::Tensor jidx, torch::Tensor lidx, + int64_t numOuterLists) { + TORCH_CHECK_VALUE( + lidx.dim() == 2, + "Invalid list indices when constructing JaggedTensor from data, offsets, indices, list indices"); + TORCH_CHECK_VALUE( + lidx.numel() == 0 || lidx.size(0) == (joffsets.size(0) - 1), + "Invalid list indices when constructing JaggedTensor from data, offsets, indices, list indices"); + TORCH_CHECK_VALUE( + joffsets.dim() == 1, + "Invalid offsets when constructing JaggedTensor from data, offsets, indices, list indices"); JaggedTensor ret; - ret.mData = jdata; - ret.mOffsets = joffsets; - ret.mListIdx = lidx; + ret.mData = jdata; + ret.mOffsets = joffsets; + ret.mListIdx = lidx; ret.mNumOuterLists = numOuterLists; - ret.mBatchIdx = jidx; + ret.mBatchIdx = jidx; ret.mLShapeCache.markDirty(); ret.recompute_lsizes_if_dirty(); return ret; } -void JaggedTensor::set_data(const torch::Tensor& data) { - TORCH_CHECK_VALUE(data.dim() > 0, "assigned data must have shape [N, ...], but got data.dim() = 0"); - TORCH_CHECK_VALUE((data.device() == mBatchIdx.device()) || (mBatchIdx.numel() == 0 && num_tensors() == 1), "Incorrect device for data"); +void +JaggedTensor::set_data(const torch::Tensor &data) { + TORCH_CHECK_VALUE(data.dim() > 0, + "assigned data must have shape [N, ...], but got data.dim() = 0"); + TORCH_CHECK_VALUE((data.device() == mBatchIdx.device()) || + (mBatchIdx.numel() == 0 && num_tensors() == 1), + "Incorrect device for data"); TORCH_CHECK_VALUE(data.device() == mOffsets.device(), "Incorrect device for data"); TORCH_CHECK_VALUE(mListIdx.dim() == 2, "Corrupt list indices. This should never happen"); - TORCH_CHECK_VALUE(mListIdx.numel() == 0 || mListIdx.size(0) == (mOffsets.size(0) - 1), "Corrupt list indices. 
This should never happen"); + TORCH_CHECK_VALUE(mListIdx.numel() == 0 || mListIdx.size(0) == (mOffsets.size(0) - 1), + "Corrupt list indices. This should never happen"); if (mBatchIdx.size(0) == 0) { TORCH_CHECK(mOffsets.dim() == 1, "bad offsets. this should never happen"); - TORCH_CHECK(mOffsets.size(0) == (num_outer_lists() + 1), "bad offsets. this should never happen"); + TORCH_CHECK(mOffsets.size(0) == (num_outer_lists() + 1), + "bad offsets. this should never happen"); TORCH_CHECK_VALUE(data.size(0) == mData.size(0), "assigned data must have shape [N, ...]"); } else { - TORCH_CHECK_VALUE(data.size(0) == mBatchIdx.size(0), "assigned data must have shape [N, ...]"); + TORCH_CHECK_VALUE(data.size(0) == mBatchIdx.size(0), + "assigned data must have shape [N, ...]"); } mData = data; } -JaggedTensor JaggedTensor::rmask(const torch::Tensor& mask) const { - TORCH_CHECK(mask.device() == mBatchIdx.device(), "mask must be on the same device as the JaggedTensor"); +JaggedTensor +JaggedTensor::rmask(const torch::Tensor &mask) const { + TORCH_CHECK(mask.device() == mBatchIdx.device(), + "mask must be on the same device as the JaggedTensor"); TORCH_CHECK(mask.dim() == 1, "mask must be 1-dimensional"); - TORCH_CHECK(mask.size(0) == mData.size(0), "mask must have the same size as the first dimension of the JaggedTensor"); + TORCH_CHECK(mask.size(0) == mData.size(0), + "mask must have the same size as the first dimension of the JaggedTensor"); TORCH_CHECK(mask.scalar_type() == torch::kBool, "mask must be of type bool"); - TORCH_CHECK((mask.size(0) == mBatchIdx.size(0)) || (mBatchIdx.size(0) == 0 && mOffsets.size(0) == 2), - "Bad jidx. This should never happen. mask.size(0) = ", mask.size(0), " mBatchIdx.size(0) = ", mBatchIdx.size(0)); - const torch::Tensor retData = mData.index({mask, "..."}); - const torch::Tensor retBatchIds = mBatchIdx.size(0) > 0 ? mBatchIdx.index({mask}) : mBatchIdx; - const torch::Tensor retOffsets = joffsets_from_jidx_and_jdata(retBatchIds, retData, num_tensors()); - return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(retData, retOffsets, retBatchIds, mListIdx, mNumOuterLists); + TORCH_CHECK((mask.size(0) == mBatchIdx.size(0)) || + (mBatchIdx.size(0) == 0 && mOffsets.size(0) == 2), + "Bad jidx. This should never happen. mask.size(0) = ", mask.size(0), + " mBatchIdx.size(0) = ", mBatchIdx.size(0)); + const torch::Tensor retData = mData.index({ mask, "..." }); + const torch::Tensor retBatchIds = mBatchIdx.size(0) > 0 ? mBatchIdx.index({ mask }) : mBatchIdx; + const torch::Tensor retOffsets = + joffsets_from_jidx_and_jdata(retBatchIds, retData, num_tensors()); + return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(retData, retOffsets, retBatchIds, + mListIdx, mNumOuterLists); } -JaggedTensor JaggedTensor::index(JaggedTensorIndex idx) const { +JaggedTensor +JaggedTensor::index(JaggedTensorIndex idx) const { if (idx.is_integer()) { return FVDB_DISPATCH_KERNEL_DEVICE(mData.device(), [&]() { return detail::ops::dispatchJaggedTensorIndex(*this, idx.integer()); }); } else if (idx.is_slice()) { int64_t start = idx.slice().start().as_int_unchecked(); - int64_t end = idx.slice().stop().as_int_unchecked(); - int64_t step = idx.slice().step().as_int_unchecked(); - TORCH_CHECK_INDEX(step == 1, "step must be 1 for JaggedTensor. Only contiguous slicing is supported."); + int64_t end = idx.slice().stop().as_int_unchecked(); + int64_t step = idx.slice().step().as_int_unchecked(); + TORCH_CHECK_INDEX(step == 1, + "step must be 1 for JaggedTensor. 
Only contiguous slicing is supported."); // Deal with symbolic int if (start >= at::indexing::INDEX_MAX) { @@ -502,93 +601,109 @@ JaggedTensor JaggedTensor::index(JaggedTensorIndex idx) const { start = end; } - start = std::max(start, (int64_t) 0); - end = std::min(end, mNumOuterLists); + start = std::max(start, (int64_t)0); + end = std::min(end, mNumOuterLists); if (mListIdx.size(0) == 0) { TORCH_CHECK(ldim() == 1, "bad list indexes. this should never happen"); - const JOffsetsType startIdx = mOffsets[start].item(); - const JOffsetsType endIdx = mOffsets[end].item(); - const torch::Tensor retLidx = mListIdx.numel() == 0 ? mListIdx : mListIdx.index({torch::indexing::Slice(start, end)}); + const JOffsetsType startIdx = mOffsets[start].item(); + const JOffsetsType endIdx = mOffsets[end].item(); + const torch::Tensor retLidx = + mListIdx.numel() == 0 ? mListIdx + : mListIdx.index({ torch::indexing::Slice(start, end) }); return JaggedTensor::from_data_offsets_and_list_ids( - mData.index({torch::indexing::Slice(startIdx, endIdx)}), - mOffsets.index({torch::indexing::Slice(start, end+1)}) - startIdx, - retLidx); + mData.index({ torch::indexing::Slice(startIdx, endIdx) }), + mOffsets.index({ torch::indexing::Slice(start, end + 1) }) - startIdx, retLidx); } else { // Find all tensors that belong to the slice - const torch::Tensor outerLidx = mListIdx.index({torch::indexing::Slice(), 0}); - const torch::Tensor mask = outerLidx.ge(start).logical_and(outerLidx.lt(end)); - const torch::Tensor joffsetCat = torch::stack({ - mOffsets.index({torch::indexing::Slice(0, num_tensors())}), - mOffsets.index({torch::indexing::Slice(1, num_tensors()+1)}) - }, 1); - const torch::Tensor selectedOffsets = joffsetCat.index({mask}); + const torch::Tensor outerLidx = mListIdx.index({ torch::indexing::Slice(), 0 }); + const torch::Tensor mask = outerLidx.ge(start).logical_and(outerLidx.lt(end)); + const torch::Tensor joffsetCat = + torch::stack({ mOffsets.index({ torch::indexing::Slice(0, num_tensors()) }), + mOffsets.index({ torch::indexing::Slice(1, num_tensors() + 1) }) }, + 1); + const torch::Tensor selectedOffsets = joffsetCat.index({ mask }); // Get the start and end offsets into the data tensor for the slice - JOffsetsType startIdx = selectedOffsets.size(0) > 0 ? selectedOffsets[0][0].item() : 0; - JOffsetsType endIdx = selectedOffsets.size(0) > 0 ? selectedOffsets[-1][1].item() : 0; + JOffsetsType startIdx = + selectedOffsets.size(0) > 0 ? selectedOffsets[0][0].item() : 0; + JOffsetsType endIdx = + selectedOffsets.size(0) > 0 ? selectedOffsets[-1][1].item() : 0; // Slice the data tensor - const torch::Tensor retData = mData.index({torch::indexing::Slice(startIdx, endIdx)}); + const torch::Tensor retData = mData.index({ torch::indexing::Slice(startIdx, endIdx) }); // Subtract the start offset from the selected offsets to get the new offsets // NOTE: This assumes offsets are always contiguous - const torch::Tensor retOffsets = selectedOffsets.numel() > 0 ? torch::cat({ - selectedOffsets.index({torch::indexing::Slice(), 0}), - selectedOffsets.index({-1, 1}).unsqueeze(0) - }) - startIdx : torch::zeros({1}, torch::TensorOptions().dtype(JOffsetsScalarType).device(mData.device())); + const torch::Tensor retOffsets = + selectedOffsets.numel() > 0 + ? 
torch::cat({ selectedOffsets.index({ torch::indexing::Slice(), 0 }), + selectedOffsets.index({ -1, 1 }).unsqueeze(0) }) - + startIdx + : torch::zeros( + { 1 }, + torch::TensorOptions().dtype(JOffsetsScalarType).device(mData.device())); // Slice the list indices and subtract the start index TORCH_CHECK(mListIdx.size(1) > 1, "bad list indexes. this should never happen"); - torch::Tensor retListIdx = mListIdx.index({mask}); - retListIdx.index({torch::indexing::Slice(), 0}) -= start; + torch::Tensor retListIdx = mListIdx.index({ mask }); + retListIdx.index({ torch::indexing::Slice(), 0 }) -= start; if (retListIdx.dim() == 0) { retListIdx = retListIdx.unsqueeze(1); } - const int64_t retNumOuterLists = end - start; - const torch::Tensor retJidx = jidx_from_joffsets(retOffsets, retData.size(0)); - return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(retData, retOffsets, retJidx, retListIdx, retNumOuterLists); + const int64_t retNumOuterLists = end - start; + const torch::Tensor retJidx = jidx_from_joffsets(retOffsets, retData.size(0)); + return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe( + retData, retOffsets, retJidx, retListIdx, retNumOuterLists); } } else if (idx.is_ellipsis()) { return *this; } else if (idx.is_jagged_tensor()) { - const JaggedTensor& jtIndices = idx.jagged_tensor(); - TORCH_CHECK_VALUE(jtIndices.device() == device(), "indices must be on the same device as the JaggedTensor"); + const JaggedTensor &jtIndices = idx.jagged_tensor(); + TORCH_CHECK_VALUE(jtIndices.device() == device(), + "indices must be on the same device as the JaggedTensor"); - TORCH_CHECK_INDEX(jtIndices.mListIdx.dim() == mListIdx.dim(), - "Indices must have the same list structure as JaggedTensor being indexed"); + TORCH_CHECK_INDEX( + jtIndices.mListIdx.dim() == mListIdx.dim(), + "Indices must have the same list structure as JaggedTensor being indexed"); for (int i = 0; i < mListIdx.dim(); ++i) { - TORCH_CHECK_INDEX(jtIndices.mListIdx.size(i) == mListIdx.size(i), - "Indices must have the same list structure as JaggedTensor being indexed"); + TORCH_CHECK_INDEX( + jtIndices.mListIdx.size(i) == mListIdx.size(i), + "Indices must have the same list structure as JaggedTensor being indexed"); } if (Config::global().pendanticErrorCheckingEnabled()) { // This is a slow check that we cap optionally do for correctness. - TORCH_CHECK_INDEX(torch::all(jtIndices.mListIdx == mListIdx).item(), - "Indices must have the same list structure as JaggedTensor being indexed. ", - "This error was raised because config.pendatic_error_checking was enabled"); + TORCH_CHECK_INDEX( + torch::all(jtIndices.mListIdx == mListIdx).item(), + "Indices must have the same list structure as JaggedTensor being indexed. ", + "This error was raised because config.pendatic_error_checking was enabled"); } - c10::ScalarType idxdt = jtIndices.scalar_type(); + c10::ScalarType idxdt = jtIndices.scalar_type(); const bool isIndexType = (idxdt == c10::ScalarType::Long || idxdt == c10::ScalarType::Int || idxdt == c10::ScalarType::Byte || idxdt == c10::ScalarType::Bool); - TORCH_CHECK_INDEX(isIndexType, "JaggedTensors used as indices must be long, int, byte or bool JaggedTensors but got ", idxdt); + TORCH_CHECK_INDEX( + isIndexType, + "JaggedTensors used as indices must be long, int, byte or bool JaggedTensors but got ", + idxdt); torch::Tensor selidx; if (jtIndices.scalar_type() == torch::kBool) { selidx = jtIndices.jdata(); } else { - // FIXME (Francis): We're not checking out of range here and it's sketchy! 
Fix in a unified CUDA kernel + // FIXME (Francis): We're not checking out of range here and it's sketchy! Fix in a + // unified CUDA kernel selidx = jtIndices.jdata().clone(); for (int i = 0; i < jtIndices.joffsets().size(0) - 1; ++i) { const JOffsetsType start = jtIndices.joffsets()[i].item(); - const JOffsetsType end = jtIndices.joffsets()[i+1].item(); - const JOffsetsType add = mOffsets[i].item(); - selidx.index({torch::indexing::Slice(start, end)}).add_(add); + const JOffsetsType end = jtIndices.joffsets()[i + 1].item(); + const JOffsetsType add = mOffsets[i].item(); + selidx.index({ torch::indexing::Slice(start, end) }).add_(add); } } - const torch::Tensor retJdata = mData.index({selidx}); - torch::Tensor retJidx = mBatchIdx.index({selidx}); + const torch::Tensor retJdata = mData.index({ selidx }); + torch::Tensor retJidx = mBatchIdx.index({ selidx }); if (retJidx.dim() > 1) { std::vector idx; idx.reserve(retJidx.dim()); @@ -599,51 +714,64 @@ JaggedTensor JaggedTensor::index(JaggedTensorIndex idx) const { retJidx = retJidx.index(idx); } retJidx = retJidx.contiguous(); - const torch::Tensor retJOffsets = joffsets_from_jidx_and_jdata(retJidx, retJdata, num_tensors()); + const torch::Tensor retJOffsets = + joffsets_from_jidx_and_jdata(retJidx, retJdata, num_tensors()); const torch::Tensor retListIdx = mListIdx; - return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(retJdata, retJOffsets, - retJidx, retListIdx, - mNumOuterLists); + return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe( + retJdata, retJOffsets, retJidx, retListIdx, mNumOuterLists); } else { TORCH_CHECK_INDEX(false, "Unsupported indexing operation"); } } -JaggedTensor JaggedTensor::jreshape(const std::vector& lsizes) const { +JaggedTensor +JaggedTensor::jreshape(const std::vector &lsizes) const { return JaggedTensor(lsizes, mData); } -JaggedTensor JaggedTensor::jreshape(const std::vector>& lsizes) const { +JaggedTensor +JaggedTensor::jreshape(const std::vector> &lsizes) const { return JaggedTensor(lsizes, num_tensors(), mData); } -JaggedTensor JaggedTensor::jreshape_as(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::jreshape_as(const JaggedTensor &other) const { return other.jagged_like(mData); } -JaggedTensor JaggedTensor::jflatten(const int64_t dim) const { +JaggedTensor +JaggedTensor::jflatten(const int64_t dim) const { int64_t jdim = dim; if (dim < 0) { jdim += ldim(); } TORCH_CHECK_INDEX(jdim >= 0 && jdim < ldim(), "Invalid dimension to flatten"); - if (ldim() == 2) { if (jdim == 1) { - torch::Tensor newJIdx = mListIdx.index({torch::indexing::Slice(), 0}).index({mBatchIdx.to(torch::kInt)}).to(JIdxScalarType); - torch::Tensor newOffsets = joffsets_from_jidx_and_jdata(newJIdx, mData, num_outer_lists()); + torch::Tensor newJIdx = mListIdx.index({ torch::indexing::Slice(), 0 }) + .index({ mBatchIdx.to(torch::kInt) }) + .to(JIdxScalarType); + torch::Tensor newOffsets = + joffsets_from_jidx_and_jdata(newJIdx, mData, num_outer_lists()); return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe( - mData, newOffsets, newJIdx, torch::empty({0, 1}, torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device())), newOffsets.size(0) - 1); + mData, newOffsets, newJIdx, + torch::empty({ 0, 1 }, + torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device())), + newOffsets.size(0) - 1); } else { return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe( - mData, mOffsets, mBatchIdx, torch::empty({0, 1}, torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device())), 
mOffsets.size(0) - 1); + mData, mOffsets, mBatchIdx, + torch::empty({ 0, 1 }, + torch::TensorOptions().dtype(JLIdxScalarType).device(mData.device())), + mOffsets.size(0) - 1); } } else if (ldim() == 1) { return JaggedTensor(mData); } else { - TORCH_CHECK(false, "Unsupported list dimension. Currently JaggedTensor only supports up to 2."); + TORCH_CHECK(false, + "Unsupported list dimension. Currently JaggedTensor only supports up to 2."); } } // JaggedTensor JaggedTensor::jagged_argsort() { @@ -655,9 +783,11 @@ JaggedTensor JaggedTensor::jflatten(const int64_t dim) const { // return jagged_like(argsortIdx); // } -JaggedTensor JaggedTensor::jsum(int64_t dim, bool keepdim) const { +JaggedTensor +JaggedTensor::jsum(int64_t dim, bool keepdim) const { const int64_t jdim = mData.dim(); - TORCH_CHECK_INDEX(dim >= -(jdim-1) && dim < jdim, "dim must be between ", -(jdim-1), " and ", jdim-1, " inclusive"); + TORCH_CHECK_INDEX(dim >= -(jdim - 1) && dim < jdim, "dim must be between ", -(jdim - 1), + " and ", jdim - 1, " inclusive"); if (dim < 0) { dim += jdim; } @@ -667,20 +797,26 @@ JaggedTensor JaggedTensor::jsum(int64_t dim, bool keepdim) const { if (mBatchIdx.size(0) == 0) { retData = mData.sum(0).unsqueeze(0); } else { - retData = detail::autograd::JaggedSum::apply(jdata(), jidx(), joffsets(), num_tensors())[0]; + retData = + detail::autograd::JaggedSum::apply(jdata(), jidx(), joffsets(), num_tensors())[0]; } - const torch::Tensor retOffsets = torch::arange(0, retData.size(0) + 1, torch::TensorOptions().dtype(JOffsetsScalarType).device(retData.device())); + const torch::Tensor retOffsets = torch::arange( + 0, retData.size(0) + 1, + torch::TensorOptions().dtype(JOffsetsScalarType).device(retData.device())); const torch::Tensor retJidx = jidx_from_joffsets(retOffsets, retData.size(0)); - return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(retData, retOffsets, retJidx, mListIdx, mNumOuterLists); + return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(retData, retOffsets, retJidx, + mListIdx, mNumOuterLists); } else { return jagged_like(mData.sum(dim, keepdim)); } } -std::vector JaggedTensor::jmin(int64_t dim, bool keepdim) const { +std::vector +JaggedTensor::jmin(int64_t dim, bool keepdim) const { const int64_t jdim = mData.dim(); - TORCH_CHECK_INDEX(dim >= -(jdim-1) && dim <= jdim, "dim must be between ", -(jdim-1), " and ", jdim-1, " inclusive"); + TORCH_CHECK_INDEX(dim >= -(jdim - 1) && dim <= jdim, "dim must be between ", -(jdim - 1), + " and ", jdim - 1, " inclusive"); if (dim < 0) { dim += jdim; } @@ -689,31 +825,37 @@ std::vector JaggedTensor::jmin(int64_t dim, bool keepdim) const { torch::Tensor minVals, minIndices; if (mBatchIdx.size(0) == 0) { auto minTuple = mData.min(0); - minVals = std::get<0>(minTuple).unsqueeze(0); - minIndices = std::get<1>(minTuple).unsqueeze(0); - } else { - auto minTuple = detail::autograd::JaggedMin::apply(jdata(), jidx(), joffsets(), num_tensors()); - minVals = minTuple[0]; + minVals = std::get<0>(minTuple).unsqueeze(0); + minIndices = std::get<1>(minTuple).unsqueeze(0); + } else { + auto minTuple = + detail::autograd::JaggedMin::apply(jdata(), jidx(), joffsets(), num_tensors()); + minVals = minTuple[0]; minIndices = minTuple[1]; } - const torch::Tensor retOffsets = torch::arange(0, minVals.size(0) + 1, torch::TensorOptions().dtype(JOffsetsScalarType).device(minVals.device())); + const torch::Tensor retOffsets = torch::arange( + 0, minVals.size(0) + 1, + torch::TensorOptions().dtype(JOffsetsScalarType).device(minVals.device())); const 
torch::Tensor retJidx = jidx_from_joffsets(retOffsets, minVals.size(0)); - JaggedTensor retVals = JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(minVals, retOffsets, retJidx, mListIdx, mNumOuterLists); + JaggedTensor retVals = JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe( + minVals, retOffsets, retJidx, mListIdx, mNumOuterLists); JaggedTensor retIdxs = retVals.jagged_like(minIndices); return { retVals, retIdxs }; } else { - auto minTuple = mData.min(dim, keepdim); - torch::Tensor minVals = std::get<0>(minTuple); + auto minTuple = mData.min(dim, keepdim); + torch::Tensor minVals = std::get<0>(minTuple); torch::Tensor minIndices = std::get<1>(minTuple); return { jagged_like(minVals), jagged_like(minIndices) }; } } -std::vector JaggedTensor::jmax(int64_t dim, bool keepdim) const { +std::vector +JaggedTensor::jmax(int64_t dim, bool keepdim) const { const int64_t jdim = mData.dim(); - TORCH_CHECK_INDEX(dim >= -(jdim-1) && dim <= jdim, "dim must be between ", -(jdim-1), " and ", jdim-1, " inclusive"); + TORCH_CHECK_INDEX(dim >= -(jdim - 1) && dim <= jdim, "dim must be between ", -(jdim - 1), + " and ", jdim - 1, " inclusive"); if (dim < 0) { dim += jdim; } @@ -722,28 +864,33 @@ std::vector JaggedTensor::jmax(int64_t dim, bool keepdim) const { torch::Tensor maxVals, maxIndices; if (mBatchIdx.size(0) == 0) { auto maxTuple = mData.max(0); - maxVals = std::get<0>(maxTuple).unsqueeze(0); - maxIndices = std::get<1>(maxTuple).unsqueeze(0); - } else { - auto maxTuple = detail::autograd::JaggedMax::apply(jdata(), jidx(), joffsets(), num_tensors()); - maxVals = maxTuple[0]; + maxVals = std::get<0>(maxTuple).unsqueeze(0); + maxIndices = std::get<1>(maxTuple).unsqueeze(0); + } else { + auto maxTuple = + detail::autograd::JaggedMax::apply(jdata(), jidx(), joffsets(), num_tensors()); + maxVals = maxTuple[0]; maxIndices = maxTuple[1]; } - const torch::Tensor retOffsets = torch::arange(0, maxVals.size(0) + 1, torch::TensorOptions().dtype(JOffsetsScalarType).device(maxVals.device())); + const torch::Tensor retOffsets = torch::arange( + 0, maxVals.size(0) + 1, + torch::TensorOptions().dtype(JOffsetsScalarType).device(maxVals.device())); const torch::Tensor retJidx = jidx_from_joffsets(retOffsets, maxVals.size(0)); - JaggedTensor retVals = JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(maxVals, retOffsets, retJidx, mListIdx, mNumOuterLists); + JaggedTensor retVals = JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe( + maxVals, retOffsets, retJidx, mListIdx, mNumOuterLists); JaggedTensor retIdxs = retVals.jagged_like(maxIndices); return { retVals, retIdxs }; } else { - auto maxTuple = mData.max(dim, keepdim); - torch::Tensor maxVals = std::get<0>(maxTuple); + auto maxTuple = mData.max(dim, keepdim); + torch::Tensor maxVals = std::get<0>(maxTuple); torch::Tensor maxIndices = std::get<1>(maxTuple); - return {jagged_like(maxVals), jagged_like(maxIndices) }; + return { jagged_like(maxVals), jagged_like(maxIndices) }; } } -JaggedTensor JaggedTensor::jcat(const std::vector& vec, c10::optional dimension) { +JaggedTensor +JaggedTensor::jcat(const std::vector &vec, c10::optional dimension) { // Null dimension is just list concatenation if (!dimension.has_value()) { TORCH_CHECK_VALUE(vec.size() > 0, "Empty jagged tensor list"); @@ -752,18 +899,24 @@ JaggedTensor JaggedTensor::jcat(const std::vector& vec, c10::optio std::vector data; std::vector offsets; std::vector lidx; - JOffsetsType curOffset = 0; - int64_t totalLists = 0; - torch::Tensor curListOffset = torch::zeros({1, 
vec[0].mListIdx.size(1)}, torch::TensorOptions().dtype(JLIdxScalarType).device(vec[0].mData.device())); + JOffsetsType curOffset = 0; + int64_t totalLists = 0; + torch::Tensor curListOffset = torch::zeros( + { 1, vec[0].mListIdx.size(1) }, + torch::TensorOptions().dtype(JLIdxScalarType).device(vec[0].mData.device())); for (size_t i = 0; i < vec.size(); ++i) { - const auto& jvec = vec[i]; - TORCH_CHECK_VALUE(jvec.mData.device() == vec[0].mData.device(), "All JaggedTensors must be on the same device"); - TORCH_CHECK_VALUE(jvec.mListIdx.size(1) == vec[0].mListIdx.size(1), "All JaggedTensors must have the same list dimension"); - TORCH_CHECK_VALUE(jvec.scalar_type() == vec[0].scalar_type(), "All JaggedTensors must have the same scalar type"); + const auto &jvec = vec[i]; + TORCH_CHECK_VALUE(jvec.mData.device() == vec[0].mData.device(), + "All JaggedTensors must be on the same device"); + TORCH_CHECK_VALUE(jvec.mListIdx.size(1) == vec[0].mListIdx.size(1), + "All JaggedTensors must have the same list dimension"); + TORCH_CHECK_VALUE(jvec.scalar_type() == vec[0].scalar_type(), + "All JaggedTensors must have the same scalar type"); data.push_back(jvec.mData); if (i < vec.size() - 1) { - offsets.push_back(jvec.mOffsets.index({torch::indexing::Slice(0, -1)}) + curOffset); + offsets.push_back(jvec.mOffsets.index({ torch::indexing::Slice(0, -1) }) + + curOffset); } else { offsets.push_back(jvec.mOffsets + curOffset); } @@ -772,408 +925,501 @@ JaggedTensor JaggedTensor::jcat(const std::vector& vec, c10::optio curListOffset[0][0] += jvec.mNumOuterLists; totalLists += jvec.mNumOuterLists; } - const torch::Tensor retJData = torch::cat(data, 0); + const torch::Tensor retJData = torch::cat(data, 0); const torch::Tensor retJOffsets = torch::cat(offsets, 0); - const torch::Tensor retJidx = jidx_from_joffsets(retJOffsets, retJData.size(0)); - const torch::Tensor retLidx = torch::cat(lidx, 0); - return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(retJData, retJOffsets, retJidx, retLidx, totalLists); + const torch::Tensor retJidx = jidx_from_joffsets(retJOffsets, retJData.size(0)); + const torch::Tensor retLidx = torch::cat(lidx, 0); + return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(retJData, retJOffsets, + retJidx, retLidx, totalLists); } else { int64_t dim = dimension.value(); TORCH_CHECK_VALUE(vec.size() > 0, "empty tensor list"); const int64_t jdim = vec[0].mData.dim(); - TORCH_CHECK_INDEX(dim >= -(jdim-1) && dim <= jdim, "dim must be between ", -(jdim-1), " and ", jdim-1, " inclusive"); + TORCH_CHECK_INDEX(dim >= -(jdim - 1) && dim <= jdim, "dim must be between ", -(jdim - 1), + " and ", jdim - 1, " inclusive"); if (dim < 0) { dim += jdim; } if (dim == 0) { - return FVDB_DISPATCH_KERNEL_DEVICE(vec[0].device(), [&]() { - return detail::ops::dispatchJCat0(vec); - }); + return FVDB_DISPATCH_KERNEL_DEVICE( + vec[0].device(), [&]() { return detail::ops::dispatchJCat0(vec); }); } else { std::vector data; - for (const auto& jvec : vec) { data.push_back(jvec.mData); } - return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(torch::cat(data, dim), vec[0].mOffsets, vec[0].mBatchIdx, vec[0].mListIdx, vec[0].mNumOuterLists); + for (const auto &jvec: vec) { + data.push_back(jvec.mData); + } + return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe( + torch::cat(data, dim), vec[0].mOffsets, vec[0].mBatchIdx, vec[0].mListIdx, + vec[0].mNumOuterLists); } } - } - -JaggedTensor JaggedTensor::sqrt() const { +JaggedTensor +JaggedTensor::sqrt() const { return jagged_like(torch::sqrt(mData)); 
} -JaggedTensor JaggedTensor::abs() const { +JaggedTensor +JaggedTensor::abs() const { return jagged_like(torch::abs(mData)); } -JaggedTensor JaggedTensor::floor() const { +JaggedTensor +JaggedTensor::floor() const { return jagged_like(torch::floor(mData)); } -JaggedTensor JaggedTensor::ceil() const { +JaggedTensor +JaggedTensor::ceil() const { return jagged_like(torch::ceil(mData)); } -JaggedTensor JaggedTensor::round(int decimals) const { +JaggedTensor +JaggedTensor::round(int decimals) const { return jagged_like(torch::round(mData, decimals)); } - -JaggedTensor& JaggedTensor::sqrt_() { +JaggedTensor & +JaggedTensor::sqrt_() { mData.sqrt_(); return *this; } -JaggedTensor& JaggedTensor::abs_() { +JaggedTensor & +JaggedTensor::abs_() { mData.abs_(); return *this; } -JaggedTensor& JaggedTensor::floor_() { +JaggedTensor & +JaggedTensor::floor_() { mData.floor_(); return *this; } -JaggedTensor& JaggedTensor::ceil_() { +JaggedTensor & +JaggedTensor::ceil_() { mData.ceil_(); return *this; } -JaggedTensor& JaggedTensor::round_(int decimals) { +JaggedTensor & +JaggedTensor::round_(int decimals) { mData.round_(decimals); return *this; } - - -const JaggedTensor& JaggedTensor::set_requires_grad(bool requires_grad) const { +const JaggedTensor & +JaggedTensor::set_requires_grad(bool requires_grad) const { mData.set_requires_grad(requires_grad); return *this; } -bool JaggedTensor::requires_grad() const { +bool +JaggedTensor::requires_grad() const { return mData.requires_grad(); } -JaggedTensor JaggedTensor::detach() const { +JaggedTensor +JaggedTensor::detach() const { return jagged_like(mData.detach()); } -JaggedTensor JaggedTensor::clone() const { +JaggedTensor +JaggedTensor::clone() const { return jagged_like(mData.clone()); } - -JaggedTensor JaggedTensor::operator+(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator+(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData + other.mData); } -JaggedTensor JaggedTensor::operator+(const int other) const { +JaggedTensor +JaggedTensor::operator+(const int other) const { return jagged_like(mData + other); } -JaggedTensor JaggedTensor::operator+(const float other) const { +JaggedTensor +JaggedTensor::operator+(const float other) const { return jagged_like(mData + other); } -JaggedTensor JaggedTensor::operator+(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator+(const torch::Tensor &other) const { return jagged_like(mData + other); } -JaggedTensor& JaggedTensor::operator+=(const JaggedTensor& other) { +JaggedTensor & +JaggedTensor::operator+=(const JaggedTensor &other) { binary_op_check(other); mData += other.mData; return *this; } -JaggedTensor& JaggedTensor::operator+=(const int other) { +JaggedTensor & +JaggedTensor::operator+=(const int other) { mData += other; return *this; } -JaggedTensor& JaggedTensor::operator+=(const float other) { +JaggedTensor & +JaggedTensor::operator+=(const float other) { mData += other; return *this; } -JaggedTensor& JaggedTensor::operator+=(const torch::Tensor& other) { +JaggedTensor & +JaggedTensor::operator+=(const torch::Tensor &other) { mData += other; return *this; } -JaggedTensor JaggedTensor::operator-(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator-(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData - other.mData); } -JaggedTensor JaggedTensor::operator-(const int other) const { +JaggedTensor +JaggedTensor::operator-(const int other) const { return jagged_like(mData - other); } 
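// [Editor's illustrative sketch -- not part of the upstream patch.]
// The element-wise operators in this file all defer to jagged_like(), so the
// result keeps the caller's joffsets/jidx/jlidx and only jdata changes. The
// hypothetical helper below shows that behaviour using only API declared in
// JaggedTensor.h later in this patch (the std::vector<torch::Tensor>
// constructor, operator*, operator+, sqrt(), jsum() and unbind1()). It assumes
// #include <torch/torch.h> plus the fvdb JaggedTensor header, and the shapes
// are made up purely for illustration.
static void
jaggedArithmeticSketch() {
    std::vector<torch::Tensor> tensors = {
        torch::rand({ 3, 4 }), // first list entry: 3 elements with 4 features
        torch::rand({ 5, 4 })  // second list entry: 5 elements with 4 features
    };
    fvdb::JaggedTensor jt(tensors);

    // Element-wise math preserves the jagged layout; only the raw data changes.
    fvdb::JaggedTensor scaled = (jt * 2.0f + 1.0f).sqrt();

    // jsum() reduces each tensor in the list separately: one output row per tensor.
    fvdb::JaggedTensor sums = jt.jsum();

    // Unbinding the result yields per-tensor views with the original sizes.
    std::vector<torch::Tensor> pieces = scaled.unbind1(); // shapes [3, 4] and [5, 4]
}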
-JaggedTensor JaggedTensor::operator-(const float other) const { +JaggedTensor +JaggedTensor::operator-(const float other) const { return jagged_like(mData - other); } -JaggedTensor JaggedTensor::operator-(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator-(const torch::Tensor &other) const { return jagged_like(mData - other); } -JaggedTensor JaggedTensor::operator-() const { +JaggedTensor +JaggedTensor::operator-() const { return jagged_like(-mData); } -JaggedTensor& JaggedTensor::operator-=(const JaggedTensor& other) { +JaggedTensor & +JaggedTensor::operator-=(const JaggedTensor &other) { binary_op_check(other); mData -= other.mData; return *this; } -JaggedTensor& JaggedTensor::operator-=(const int other) { +JaggedTensor & +JaggedTensor::operator-=(const int other) { mData -= other; return *this; } -JaggedTensor& JaggedTensor::operator-=(const float other) { +JaggedTensor & +JaggedTensor::operator-=(const float other) { mData -= other; return *this; } -JaggedTensor& JaggedTensor::operator-=(const torch::Tensor& other) { +JaggedTensor & +JaggedTensor::operator-=(const torch::Tensor &other) { mData -= other; return *this; } -JaggedTensor JaggedTensor::operator*(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator*(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData * other.mData); } -JaggedTensor JaggedTensor::operator*(const int other) const { +JaggedTensor +JaggedTensor::operator*(const int other) const { return jagged_like(mData * other); } -JaggedTensor JaggedTensor::operator*(const float other) const { +JaggedTensor +JaggedTensor::operator*(const float other) const { return jagged_like(mData * other); } -JaggedTensor JaggedTensor::operator*(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator*(const torch::Tensor &other) const { return jagged_like(mData * other); } -JaggedTensor& JaggedTensor::operator*=(const JaggedTensor& other) { +JaggedTensor & +JaggedTensor::operator*=(const JaggedTensor &other) { binary_op_check(other); mData *= other.mData; return *this; } -JaggedTensor& JaggedTensor::operator*=(const int other) { +JaggedTensor & +JaggedTensor::operator*=(const int other) { mData *= other; return *this; } -JaggedTensor& JaggedTensor::operator*=(const float other) { +JaggedTensor & +JaggedTensor::operator*=(const float other) { mData *= other; return *this; } -JaggedTensor& JaggedTensor::operator*=(const torch::Tensor& other) { +JaggedTensor & +JaggedTensor::operator*=(const torch::Tensor &other) { mData *= other; return *this; } -JaggedTensor JaggedTensor::operator/(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator/(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData / other.mData); } -JaggedTensor JaggedTensor::operator/(const int other) const { +JaggedTensor +JaggedTensor::operator/(const int other) const { return jagged_like(mData / other); } -JaggedTensor JaggedTensor::operator/(const float other) const { +JaggedTensor +JaggedTensor::operator/(const float other) const { return jagged_like(mData / other); } -JaggedTensor JaggedTensor::operator/(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator/(const torch::Tensor &other) const { return jagged_like(mData / other); } -JaggedTensor& JaggedTensor::operator/=(const JaggedTensor& other) { +JaggedTensor & +JaggedTensor::operator/=(const JaggedTensor &other) { binary_op_check(other); mData /= other.mData; return *this; } -JaggedTensor& 
JaggedTensor::operator/=(const int other) { +JaggedTensor & +JaggedTensor::operator/=(const int other) { mData /= other; return *this; } -JaggedTensor& JaggedTensor::operator/=(const float other) { +JaggedTensor & +JaggedTensor::operator/=(const float other) { mData /= other; return *this; } -JaggedTensor& JaggedTensor::operator/=(const torch::Tensor& other) { +JaggedTensor & +JaggedTensor::operator/=(const torch::Tensor &other) { mData /= other; return *this; } -JaggedTensor JaggedTensor::floordiv(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::floordiv(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(torch::floor_divide(mData, other.mData)); } -JaggedTensor JaggedTensor::floordiv(const int other) const { +JaggedTensor +JaggedTensor::floordiv(const int other) const { return jagged_like(torch::floor_divide(mData, other)); } -JaggedTensor JaggedTensor::floordiv(const float other) const { +JaggedTensor +JaggedTensor::floordiv(const float other) const { return jagged_like(torch::floor_divide(mData, other)); } -JaggedTensor JaggedTensor::floordiv(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::floordiv(const torch::Tensor &other) const { return jagged_like(torch::floor_divide(mData, other)); } -JaggedTensor& JaggedTensor::floordiveq(const JaggedTensor& other) { +JaggedTensor & +JaggedTensor::floordiveq(const JaggedTensor &other) { binary_op_check(other); mData.floor_divide_(other.mData); return *this; } -JaggedTensor& JaggedTensor::floordiveq(const int other) { +JaggedTensor & +JaggedTensor::floordiveq(const int other) { mData = torch::floor_divide(mData, other); return *this; } -JaggedTensor& JaggedTensor::floordiveq(const float other) { +JaggedTensor & +JaggedTensor::floordiveq(const float other) { mData = torch::floor_divide(mData, other); return *this; } -JaggedTensor& JaggedTensor::floordiveq(const torch::Tensor& other) { +JaggedTensor & +JaggedTensor::floordiveq(const torch::Tensor &other) { mData.floor_divide_(other); return *this; } -JaggedTensor JaggedTensor::operator%(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator%(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData % other.mData); } -JaggedTensor JaggedTensor::operator%(const int other) const { +JaggedTensor +JaggedTensor::operator%(const int other) const { return jagged_like(mData % other); } -JaggedTensor JaggedTensor::operator%(const float other) const { +JaggedTensor +JaggedTensor::operator%(const float other) const { return jagged_like(mData % other); } -JaggedTensor JaggedTensor::operator%(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator%(const torch::Tensor &other) const { return jagged_like(mData % other); } -JaggedTensor& JaggedTensor::operator%=(const JaggedTensor& other) { +JaggedTensor & +JaggedTensor::operator%=(const JaggedTensor &other) { binary_op_check(other); mData = mData % other.mData; return *this; } -JaggedTensor& JaggedTensor::operator%=(const int other) { +JaggedTensor & +JaggedTensor::operator%=(const int other) { mData = mData % other; return *this; } -JaggedTensor& JaggedTensor::operator%=(const float other) { +JaggedTensor & +JaggedTensor::operator%=(const float other) { mData = mData % other; return *this; } -JaggedTensor& JaggedTensor::operator%=(const torch::Tensor& other) { +JaggedTensor & +JaggedTensor::operator%=(const torch::Tensor &other) { mData = mData % other; return *this; } -JaggedTensor JaggedTensor::pow(const JaggedTensor& other) const { 
+JaggedTensor +JaggedTensor::pow(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData.pow(other.mData)); } -JaggedTensor JaggedTensor::pow(const int other) const { +JaggedTensor +JaggedTensor::pow(const int other) const { return jagged_like(mData.pow(other)); } -JaggedTensor JaggedTensor::pow(const float other) const { +JaggedTensor +JaggedTensor::pow(const float other) const { return jagged_like(mData.pow(other)); } -JaggedTensor JaggedTensor::pow(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::pow(const torch::Tensor &other) const { return jagged_like(mData.pow(other)); } -JaggedTensor& JaggedTensor::poweq(const JaggedTensor& other) { +JaggedTensor & +JaggedTensor::poweq(const JaggedTensor &other) { binary_op_check(other); mData.pow_(other.mData); return *this; } -JaggedTensor& JaggedTensor::poweq(const int other) { +JaggedTensor & +JaggedTensor::poweq(const int other) { mData = mData.pow(other); return *this; } -JaggedTensor& JaggedTensor::poweq(const float other) { +JaggedTensor & +JaggedTensor::poweq(const float other) { mData = mData.pow(other); return *this; } -JaggedTensor& JaggedTensor::poweq(const torch::Tensor& other) { +JaggedTensor & +JaggedTensor::poweq(const torch::Tensor &other) { mData.pow_(other); return *this; } - -JaggedTensor JaggedTensor::operator>(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator>(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData > other.mData); } -JaggedTensor JaggedTensor::operator>(const int other) const { +JaggedTensor +JaggedTensor::operator>(const int other) const { return jagged_like(mData > other); } -JaggedTensor JaggedTensor::operator>(const float other) const { +JaggedTensor +JaggedTensor::operator>(const float other) const { return jagged_like(mData > other); } -JaggedTensor JaggedTensor::operator>(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator>(const torch::Tensor &other) const { return jagged_like(mData > other); } -JaggedTensor JaggedTensor::operator>=(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator>=(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData >= other.mData); } -JaggedTensor JaggedTensor::operator>=(const int other) const { +JaggedTensor +JaggedTensor::operator>=(const int other) const { return jagged_like(mData >= other); } -JaggedTensor JaggedTensor::operator>=(const float other) const { +JaggedTensor +JaggedTensor::operator>=(const float other) const { return jagged_like(mData >= other); } -JaggedTensor JaggedTensor::operator>=(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator>=(const torch::Tensor &other) const { return jagged_like(mData >= other); } -JaggedTensor JaggedTensor::operator<(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator<(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData < other.mData); } -JaggedTensor JaggedTensor::operator<(const int other) const { +JaggedTensor +JaggedTensor::operator<(const int other) const { return jagged_like(mData < other); } -JaggedTensor JaggedTensor::operator<(const float other) const { +JaggedTensor +JaggedTensor::operator<(const float other) const { return jagged_like(mData < other); } -JaggedTensor JaggedTensor::operator<(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator<(const torch::Tensor &other) const { return jagged_like(mData < other); } -JaggedTensor JaggedTensor::operator<=(const 
JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator<=(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData <= other.mData); } -JaggedTensor JaggedTensor::operator<=(const int other) const { +JaggedTensor +JaggedTensor::operator<=(const int other) const { return jagged_like(mData <= other); } -JaggedTensor JaggedTensor::operator<=(const float other) const { +JaggedTensor +JaggedTensor::operator<=(const float other) const { return jagged_like(mData <= other); } -JaggedTensor JaggedTensor::operator<=(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator<=(const torch::Tensor &other) const { return jagged_like(mData <= other); } -JaggedTensor JaggedTensor::operator==(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator==(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData == other.mData); } -JaggedTensor JaggedTensor::operator==(const int other) const { +JaggedTensor +JaggedTensor::operator==(const int other) const { return jagged_like(mData == other); } -JaggedTensor JaggedTensor::operator==(const float other) const { +JaggedTensor +JaggedTensor::operator==(const float other) const { return jagged_like(mData == other); } -JaggedTensor JaggedTensor::operator==(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator==(const torch::Tensor &other) const { return jagged_like(mData == other); } -JaggedTensor JaggedTensor::operator!=(const JaggedTensor& other) const { +JaggedTensor +JaggedTensor::operator!=(const JaggedTensor &other) const { binary_op_check(other); return jagged_like(mData != other.mData); } -JaggedTensor JaggedTensor::operator!=(const int other) const { +JaggedTensor +JaggedTensor::operator!=(const int other) const { return jagged_like(mData != other); } -JaggedTensor JaggedTensor::operator!=(const float other) const { +JaggedTensor +JaggedTensor::operator!=(const float other) const { return jagged_like(mData != other); } -JaggedTensor JaggedTensor::operator!=(const torch::Tensor& other) const { +JaggedTensor +JaggedTensor::operator!=(const torch::Tensor &other) const { return jagged_like(mData != other); } diff --git a/fvdb/src/JaggedTensor.h b/fvdb/src/JaggedTensor.h index eb94ed875d..6875671d8c 100644 --- a/fvdb/src/JaggedTensor.h +++ b/fvdb/src/JaggedTensor.h @@ -1,125 +1,136 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_JAGGEDTENSOR_H +#define FVDB_JAGGEDTENSOR_H +#include "detail/utils/Utils.h" + +#include #include #include -#include - -#include "detail/utils/Utils.h" namespace fvdb { struct JaggedTensorIndex; -using JIdxType = int32_t; +using JIdxType = int32_t; using JOffsetsType = int64_t; -using JLIdxType = int32_t; +using JLIdxType = int32_t; -constexpr c10::ScalarType JIdxScalarType = c10::CppTypeToScalarType::value; +constexpr c10::ScalarType JIdxScalarType = c10::CppTypeToScalarType::value; constexpr c10::ScalarType JOffsetsScalarType = c10::CppTypeToScalarType::value; -constexpr c10::ScalarType JLIdxScalarType = c10::CppTypeToScalarType::value; +constexpr c10::ScalarType JLIdxScalarType = c10::CppTypeToScalarType::value; -template -class JaggedAccessor { - torch::TensorAccessor mBatchIdx; +template class JaggedAccessor { + torch::TensorAccessor mBatchIdx; torch::TensorAccessor mOffsets; - torch::TensorAccessor mListIndexes; - torch::TensorAccessor mData; + torch::TensorAccessor mListIndexes; + torch::TensorAccessor mData; friend class JaggedTensor; - 
JaggedAccessor(torch::TensorAccessor batchIdx, + JaggedAccessor(torch::TensorAccessor batchIdx, torch::TensorAccessor offsets, - torch::TensorAccessor listIndexes, - torch::TensorAccessor data) + torch::TensorAccessor listIndexes, + torch::TensorAccessor data) : mBatchIdx(batchIdx), mOffsets(offsets), mListIndexes(listIndexes), mData(data) {} -public: - template - using TensorAccessorType = torch::TensorAccessor; + public: + template using TensorAccessorType = torch::TensorAccessor; - inline __hostdev__ int64_t elementCount() const { + inline __hostdev__ int64_t + elementCount() const { return mData.size(0); } - inline __hostdev__ JIdxType batchIdx(int64_t idx) const { + inline __hostdev__ JIdxType + batchIdx(int64_t idx) const { return mBatchIdx.size(0) > 0 ? mBatchIdx[idx] : 0; } - inline __hostdev__ JOffsetsType offsetStart(int64_t idx) const { + inline __hostdev__ JOffsetsType + offsetStart(int64_t idx) const { return mOffsets[idx]; } - inline __hostdev__ JOffsetsType offsetEnd(int64_t idx) const { - return mOffsets[idx+1]; + inline __hostdev__ JOffsetsType + offsetEnd(int64_t idx) const { + return mOffsets[idx + 1]; } - inline __hostdev__ const torch::TensorAccessor& data() const { + inline __hostdev__ const torch::TensorAccessor & + data() const { return mData; } }; - -template typename PtrTraits = torch::DefaultPtrTraits> +template typename PtrTraits = torch::DefaultPtrTraits> class PackedJaggedAccessor32 { - torch::PackedTensorAccessor32 mBatchIdx; + torch::PackedTensorAccessor32 mBatchIdx; torch::PackedTensorAccessor32 mOffsets; - torch::PackedTensorAccessor32 mListIndexes; - torch::PackedTensorAccessor32 mData; + torch::PackedTensorAccessor32 mListIndexes; + torch::PackedTensorAccessor32 mData; friend class JaggedTensor; - PackedJaggedAccessor32(torch::PackedTensorAccessor32 batchIdx, + PackedJaggedAccessor32(torch::PackedTensorAccessor32 batchIdx, torch::PackedTensorAccessor32 offsets, - torch::PackedTensorAccessor32 listIndexes, - torch::PackedTensorAccessor32 data) + torch::PackedTensorAccessor32 listIndexes, + torch::PackedTensorAccessor32 data) : mBatchIdx(batchIdx), mOffsets(offsets), mListIndexes(listIndexes), mData(data) {} -public: - + public: template using TensorAccessorType = torch::TensorAccessor; - inline __hostdev__ int64_t elementCount() const { + inline __hostdev__ int64_t + elementCount() const { return mData.size(0); } - inline __hostdev__ JIdxType batchIdx(int64_t idx) const { + inline __hostdev__ JIdxType + batchIdx(int64_t idx) const { return mBatchIdx.size(0) > 0 ? 
mBatchIdx[idx] : 0; } - inline __hostdev__ JOffsetsType offsetStart(int64_t idx) const { + inline __hostdev__ JOffsetsType + offsetStart(int64_t idx) const { return mOffsets[idx]; } - inline __hostdev__ JOffsetsType offsetEnd(int64_t idx) const { - return mOffsets[idx+1]; + inline __hostdev__ JOffsetsType + offsetEnd(int64_t idx) const { + return mOffsets[idx + 1]; } - inline __hostdev__ const torch::PackedTensorAccessor32& data() const { + inline __hostdev__ const torch::PackedTensorAccessor32 & + data() const { return mData; } }; - class JaggedTensor : public torch::CustomClassHolder { - torch::Tensor mData; // Actual data indexed by a jagged tensor - torch::Tensor mBatchIdx; // Which (linear) batch is each datum in - torch::Tensor mOffsets; // Offset of each tensor in the list of lists - torch::Tensor mListIdx; // LoL indexing of tensor with shape [num_tensors, ldim] - int64_t mNumOuterLists; // Number of outer lists in this JaggedTensor + torch::Tensor mData; // Actual data indexed by a jagged tensor + torch::Tensor mBatchIdx; // Which (linear) batch is each datum in + torch::Tensor mOffsets; // Offset of each tensor in the list of lists + torch::Tensor mListIdx; // LoL indexing of tensor with shape [num_tensors, ldim] + int64_t mNumOuterLists; // Number of outer lists in this JaggedTensor // Store the number of elements in each tensor in the jagged tensor // Computing this requires a GPU -> CPU copy so we cache it struct { - std::vector mLShape1; - std::vector> mLShape2; + std::vector mLShape1; + std::vector> mLShape2; std::vector>> mLShape3; - bool mDirty = true; - void markDirty() { mDirty = true; } - void clear() { + bool mDirty = true; + void + markDirty() { + mDirty = true; + } + void + clear() { mLShape1.clear(); mLShape2.clear(); mLShape3.clear(); @@ -129,25 +140,30 @@ class JaggedTensor : public torch::CustomClassHolder { void recompute_lsizes_if_dirty(); + void binary_op_check(const JaggedTensor &other) const; - void binary_op_check(const JaggedTensor& other) const; - -public: - static torch::Tensor joffsets_from_jidx_and_jdata(torch::Tensor jidx, torch::Tensor jdata, int64_t num_tensors); + public: + static torch::Tensor joffsets_from_jidx_and_jdata(torch::Tensor jidx, torch::Tensor jdata, + int64_t num_tensors); static torch::Tensor jidx_from_joffsets(torch::Tensor joffsets, int64_t num_elements); - static JaggedTensor from_jdata_joffsets_jidx_and_lidx_unsafe(torch::Tensor jdata, torch::Tensor joffsets, - torch::Tensor jidx, torch::Tensor jlidx, - int64_t numOuterLists); + static JaggedTensor from_jdata_joffsets_jidx_and_lidx_unsafe(torch::Tensor jdata, + torch::Tensor joffsets, + torch::Tensor jidx, + torch::Tensor jlidx, + int64_t numOuterLists); - static JaggedTensor from_data_indices_and_list_ids(torch::Tensor data, torch::Tensor indices, torch::Tensor list_ids, int64_t num_tensors); - static JaggedTensor from_data_offsets_and_list_ids(torch::Tensor data, torch::Tensor offsets, torch::Tensor list_ids); + static JaggedTensor from_data_indices_and_list_ids(torch::Tensor data, torch::Tensor indices, + torch::Tensor list_ids, int64_t num_tensors); + static JaggedTensor from_data_offsets_and_list_ids(torch::Tensor data, torch::Tensor offsets, + torch::Tensor list_ids); /// @brief Concatenate the list of JaggedTensors along a given dimension. /// There are two modes for this function. /// 1. If dim is an integer: /// e.g. 
if [jt_a, jt_b] are two JaggedTensors of the form - /// jt_a = [[a_11, a_12], [a_21], [a_31, a_32]] and jt_b = [[b_11, b_12], [b_21], [b_31, b_32]], - /// then JaggedTensor::jcat({jt_a, jt_b}) will return a JaggedTensor of the form + /// jt_a = [[a_11, a_12], [a_21], [a_31, a_32]] and jt_b = [[b_11, b_12], [b_21], + /// [b_31, b_32]], then JaggedTensor::jcat({jt_a, jt_b}) will return a JaggedTensor + /// of the form /// [[torch.cat([a_11, b_11], dim=dim), torch.cat([a_12, b_12], dim=dim)], /// [torch.cat([a_21, b_21], dim=dim)], /// [torch.cat([a_31, b_31], dim=dim), torch.cat([a_32, b_32], dim=dim)]] @@ -157,176 +173,229 @@ class JaggedTensor : public torch::CustomClassHolder { /// then JaggedTensor::jcat({jt_a, jt_b}) will return a JaggedTensor of the form /// [[a_11, a_12], [a_21], [a_31, a_32], [b_11], [b_21, b_22]] /// @param vec A vector of JaggedTensors to concatenate - /// @param dim The dimension along which to concatenate each JaggedTensor or c10::nullopt to concatenate + /// @param dim The dimension along which to concatenate each JaggedTensor or c10::nullopt to + /// concatenate /// the JaggedTensors as lists /// @return A JaggedTensor containing the concatenated data - static JaggedTensor jcat(const std::vector& vec, c10::optional dim); + static JaggedTensor jcat(const std::vector &vec, c10::optional dim); /// @brief Create an empty JaggedTensor JaggedTensor() { - mData = torch::Tensor(); - mBatchIdx = torch::empty({0}, torch::TensorOptions().dtype(JIdxScalarType)); - mOffsets = torch::zeros({1}, torch::TensorOptions().dtype(JOffsetsScalarType)); - mListIdx = torch::empty({0, 1}, torch::TensorOptions().dtype(JLIdxScalarType)); - mNumOuterLists = 0; + mData = torch::Tensor(); + mBatchIdx = torch::empty({ 0 }, torch::TensorOptions().dtype(JIdxScalarType)); + mOffsets = torch::zeros({ 1 }, torch::TensorOptions().dtype(JOffsetsScalarType)); + mListIdx = torch::empty({ 0, 1 }, torch::TensorOptions().dtype(JLIdxScalarType)); + mNumOuterLists = 0; } - /// @brief Create a JaggedTensor representing a list with a single tensor. Note this function does not copy the + /// @brief Create a JaggedTensor representing a list with a single tensor. Note this function + /// does not copy the /// data tensor, it only creates a view of it. /// @param data The data tensor JaggedTensor(torch::Tensor data); /// @brief Create a JaggedTensor representing a list of tensors. /// @param tensors A list of tensors - JaggedTensor(const std::vector& tensors); + JaggedTensor(const std::vector &tensors); /// @brief Create a JaggedTensor representing a list of lists of tensors. /// @param tensors A list of lists of tensors - JaggedTensor(const std::vector>& tensors); - - /// @brief Create a JaggedTensor representing a list of tensors where the number of elements in each tensor is given - /// by the lsizes vector. i.e. if lsizes = [2, 1, 2], then the first tensor will have 2 elements, the second - /// tensor will have 1 element, and the third tensor will have 2 elements. The raw data tensor must then have - /// a number of elements equal to the sum of the elements in lsizes (i.e. shape [sum(lsizes), ...]) + JaggedTensor(const std::vector> &tensors); + + /// @brief Create a JaggedTensor representing a list of tensors where the number of elements in + /// each tensor is given + /// by the lsizes vector. i.e. if lsizes = [2, 1, 2], then the first tensor will have 2 + /// elements, the second tensor will have 1 element, and the third tensor will have 2 + /// elements. 
The raw data tensor must then have a number of elements equal to the sum of + /// the elements in lsizes (i.e. shape [sum(lsizes), ...]) /// @param lsizes A vector of integers indicating the number of elements in each tensor /// @param data The raw data tensor - JaggedTensor(const std::vector& lsizes, const torch::Tensor data); - - /// @brief Create a JaggedTensor representing a list of lists of tensors where the number of elements in each tensor - /// is given by the lsizes vector. i.e. if lsizes = [[2, 1], [5, 6, 7]], then the first list will have 2 tensors with 1 and 2 elements - /// respectively and the second list will have 3 tensors with 5, 6, and 7 elements respectively. - /// The raw data tensor must then have a number of elements equal to the sum of the elements in lsizes (i.e. shape [sum(lsizes), ...]) - /// @param lsizes A vector of vectors of integers indicating the number of elements in each tensor + JaggedTensor(const std::vector &lsizes, const torch::Tensor data); + + /// @brief Create a JaggedTensor representing a list of lists of tensors where the number of + /// elements in each tensor + /// is given by the lsizes vector. i.e. if lsizes = [[2, 1], [5, 6, 7]], then the first + /// list will have 2 tensors with 1 and 2 elements respectively and the second list will + /// have 3 tensors with 5, 6, and 7 elements respectively. The raw data tensor must then + /// have a number of elements equal to the sum of the elements in lsizes (i.e. shape + /// [sum(lsizes), ...]) + /// @param lsizes A vector of vectors of integers indicating the number of elements in each + /// tensor /// @param total_tensors The total number of tensors in the list of lists /// @param data The raw data tensor - JaggedTensor(const std::vector>& lsizes, const int64_t total_tensors, const torch::Tensor data); + JaggedTensor(const std::vector> &lsizes, const int64_t total_tensors, + const torch::Tensor data); - /// @brief Create a JaggedTensor with the same list structure as this one but with the given raw data. - /// The returned JaggedTensor will share the same memory for indices/list ids/offsets as this one - /// those are modified. + /// @brief Create a JaggedTensor with the same list structure as this one but with the given raw + /// data. + /// The returned JaggedTensor will share the same memory for indices/list ids/offsets as + /// this one those are modified. /// @param data A tensor with the same number of elements as the original data /// @return A JaggedTensor with the same list structure as this one but with the given data JaggedTensor jagged_like(torch::Tensor data) const; /// @brief Set the raw data of this JaggedTensor to the given tensor /// @param data A data tensor with the same number of elements as the original data - void set_data(const torch::Tensor& data); + void set_data(const torch::Tensor &data); /// @brief Get the raw data indexed by this JaggedTensor /// @return The raw data tensor - const torch::Tensor& jdata() const { return mData; } + const torch::Tensor & + jdata() const { + return mData; + } - /// @brief Get the indices of this jagged tensor. i.e. a tensor of size (num_elements,) indicating which + /// @brief Get the indices of this jagged tensor. i.e. 
a tensor of size (num_elements,) + /// indicating which /// tensor each element belongs to /// @return The indices of this JaggedTensor - const torch::Tensor& jidx() const { return mBatchIdx; } + const torch::Tensor & + jidx() const { + return mBatchIdx; + } - /// @brief Get the offsets of each tensor indexed by this JaggedTensor. i.e. a tensor of size (num_tensors + 1) - /// where joffsets[i] is the start offset in jdata and joffsets[i+1] is the end offset in jdata + /// @brief Get the offsets of each tensor indexed by this JaggedTensor. i.e. a tensor of size + /// (num_tensors + 1) + /// where joffsets[i] is the start offset in jdata and joffsets[i+1] is the end offset in + /// jdata /// @return The offsets of each tensor indexed by this JaggedTensor - const torch::Tensor& joffsets() const { return mOffsets; } + const torch::Tensor & + joffsets() const { + return mOffsets; + } - /// @brief Get the list indices of each tensor indexed by this JaggedTensor. i.e. a tensor of size (num_tensors, ldim) - /// where e.g. jlidx[i][j] is the index of the j-th list in the i-th tensor (for a list of lists JaggedTensor) + /// @brief Get the list indices of each tensor indexed by this JaggedTensor. i.e. a tensor of + /// size (num_tensors, ldim) + /// where e.g. jlidx[i][j] is the index of the j-th list in the i-th tensor (for a list + /// of lists JaggedTensor) /// @return The list indices of each tensor indexed by this JaggedTensor - const torch::Tensor& jlidx() const { return mListIdx; } + const torch::Tensor & + jlidx() const { + return mListIdx; + } /// @brief Get the number of outer lists in this JaggedTensor - int64_t num_outer_lists() const { return mNumOuterLists; } + int64_t + num_outer_lists() const { + return mNumOuterLists; + } /// @brief Get the number of tensors in this JaggedTensor - int64_t num_tensors() const { return mOffsets.size(0) - 1; } + int64_t + num_tensors() const { + return mOffsets.size(0) - 1; + } - /// @brief Get the number of elements in each tensor indexed by this JaggedTensor. Assumes the JaggedTensor has ldim() == 1 + /// @brief Get the number of elements in each tensor indexed by this JaggedTensor. Assumes the + /// JaggedTensor has ldim() == 1 /// i.e. it represents a list of tensors /// @return The number of elements in each tensor indexed by this JaggedTensor std::vector lsizes1() const; - /// @brief Get the number of elements in each tensor indexed by this JaggedTensor. Assumes JaggedTensor has ldim() == 2 + /// @brief Get the number of elements in each tensor indexed by this JaggedTensor. Assumes + /// JaggedTensor has ldim() == 2 /// i.e. it represents a list of lists of tensors - /// @return The number of elements in each tensor indexed by this JaggedTensor such that lsizes2()[i][j] is the number of elements + /// @return The number of elements in each tensor indexed by this JaggedTensor such that + /// lsizes2()[i][j] is the number of elements /// in the j-th tensor in i-th list std::vector> lsizes2() const; - /// @brief Get the number of nested lists encoded by this JaggedTensor. An ldim of one means this JaggedTensor encodes a list - // of tensors, an ldim of 2 means this JaggedTensor encodes a list of lists of tensors, etc. + /// @brief Get the number of nested lists encoded by this JaggedTensor. An ldim of one means + /// this JaggedTensor encodes a list + // of tensors, an ldim of 2 means this JaggedTensor encodes a list of lists of tensors, + // etc. 
/// @return The number of nested lists encoded by this JaggedTensor int64_t ldim() const; - /// @brief Get the size of each element indexed by this JaggedTensor. i.e. if the JaggedTensor represents a list of tensors + /// @brief Get the size of each element indexed by this JaggedTensor. i.e. if the JaggedTensor + /// represents a list of tensors /// where each tensor has shape [N, A, B, C], then esizes() will return [A, B, C] /// @return The size of each element indexed by this JaggedTensor std::vector<int64_t> esizes() const; - /// @brief Get the number of dimensions of each element indexed by this JaggedTensor. i.e. if the JaggedTensor represents a list of tensors + /// @brief Get the number of dimensions of each element indexed by this JaggedTensor. i.e. if + /// the JaggedTensor represents a list of tensors /// where each tensor has shape [N, A, B, C], then edim() will return 3 /// @return The number of dimensions of each element indexed by this JaggedTensor int64_t edim() const; - /// @brief Convert the JaggedTensor to a list of tensors assuming this JaggedTensor represents a list of tensors. + /// @brief Convert the JaggedTensor to a list of tensors assuming this JaggedTensor represents a + /// list of tensors. /// Note this function doesn't work for nested lists of tensors (instead use unbind2()) /// @return A list of tensors where each tensor is indexed by this JaggedTensor. std::vector<torch::Tensor> unbind1() const; - /// @brief Convert the JaggedTensor to a list of lists of tensors assuming this JaggedTensor represents a list of lists of tensors. + /// @brief Convert the JaggedTensor to a list of lists of tensors assuming this JaggedTensor + /// represents a list of lists of tensors. /// Note this function doesn't work for a flat list of tensors (instead use unbind1()) /// @return A list of lists of tensors where each tensor is indexed by this JaggedTensor. std::vector<std::vector<torch::Tensor>> unbind2() const; - /// @brief Index this JaggedTensor along the outer list dimension. There are several ways to index a JaggedTensor jt: - /// 1. Indexing with an integer jt[i] will return the i^th list in this tensor (or a list containing the i^th + /// @brief Index this JaggedTensor along the outer list dimension. There are several ways to + /// index a JaggedTensor jt: + /// 1. Indexing with an integer jt[i] will return the i^th list in this tensor (or a list + /// containing the i^th /// tensor if jt represents a list of tensors) - /// 2. Indexing with a slice jt[2:5] will return a JaggedTensor containing the 2nd, 3rd, and 4th lists in this tensor + /// 2. Indexing with a slice jt[2:5] will return a JaggedTensor containing the 2nd, 3rd, + /// and 4th lists in this tensor /// Note: We currently only support contiguous slices (i.e. stride = 1) /// 3. Indexing with another JaggedTensor of boolean mask values jt[mask] /// will return a JaggedTensor containing tensors masked by the boolean mask /// i.e. jt[mask][i][j].jdata = jt[i][j].jdata[mask[i][j].jdata] - /// 4. Indexing with a tensor of integer indices jt[indices] will return a JaggedTensor containing tensors - /// indexed by the integer indices. i.e. jt[indices][i][j].jdata = jt[i][j].jdata[indices[i][j]] + /// 4. Indexing with a tensor of integer indices jt[indices] will return a JaggedTensor + /// containing tensors + /// indexed by the integer indices. i.e. jt[indices][i][j].jdata = + /// jt[i][j].jdata[indices[i][j]] /// 5. Indexing with ellipses jt[...]
is a no-op /// @param idx The index to use to index this JaggedTensor /// @return A JaggedTensor containing the indexed data JaggedTensor index(JaggedTensorIndex idx) const; - /// @brief Reshape this JaggedTensor to have a new list structure. The provided lshape should be compatible with + /// @brief Reshape this JaggedTensor to have a new list structure. The provided lshape should be + /// compatible with /// this tensor. i.e. the sum of the elements in lshape should be equal to the number of /// elements in this JaggedTensor. - /// Note this function creates a view over the original JaggedTensor so modifying the returned JaggedTensor - /// will modify the original tensor. + /// Note this function creates a view over the original JaggedTensor so modifying the + /// returned JaggedTensor will modify the original tensor. /// @param lsizes The new list structure /// @return A JaggedTensor with the new list structure - JaggedTensor jreshape(const std::vector& lsizes) const; - JaggedTensor jreshape(const std::vector>& lsizes) const; + JaggedTensor jreshape(const std::vector &lsizes) const; + JaggedTensor jreshape(const std::vector> &lsizes) const; /// @brief Reshape this JaggedTensor to have the same list structure as another JaggedTensor. - /// Note this function creates a view over the original JaggedTensor so modifying the returned JaggedTensor - /// will modify the original tensor. - /// @param other The JaggedTensor to reshape this JaggedTensor to have the same list structure as + /// Note this function creates a view over the original JaggedTensor so modifying the + /// returned JaggedTensor will modify the original tensor. + /// @param other The JaggedTensor to reshape this JaggedTensor to have the same list structure + /// as /// @return A JaggedTensor with the same list structure as the other JaggedTensor - JaggedTensor jreshape_as(const JaggedTensor& other) const; + JaggedTensor jreshape_as(const JaggedTensor &other) const; - /// Flatten one of the list dimensions of this JaggedTensor. i.e. if this JaggedTensor represents a list of lists of tensors - /// then jflatten(0) will flatten the outer list dimension and jflatten(1) will flatten the inner list dimension. - /// e.g. if this JaggedTensor represents a list of lists of tensors [[A, B], [C], [D, E]] then + /// Flatten one of the list dimensions of this JaggedTensor. i.e. if this JaggedTensor + /// represents a list of lists of tensors then jflatten(0) will flatten the outer list dimension + /// and jflatten(1) will flatten the inner list dimension. e.g. if this JaggedTensor represents + /// a list of lists of tensors [[A, B], [C], [D, E]] then /// - jflatten(0) will return a JaggedTensor [A, B, C, D, E] - /// - jflatten(1) will return a JaggedTensor [[torch.cat(A, B, dim=0)], [C], [torch.cat(D, E, dim=0)]] + /// - jflatten(1) will return a JaggedTensor [[torch.cat(A, B, dim=0)], [C], [torch.cat(D, + /// E, dim=0)]] /// e.g. if this JaggedTensor represents a list of tensors with shapes [A, B, C] then /// - jflatten(0) will return a JaggedTensor with shape [torch.cat(A, B, C, dim=0)] /// - jflatten(1) will raise an exception as there is no inner list dimension - /// Note this function creates a view over the original JaggedTensor so modifying the returned JaggedTensor - /// will modify the original tensor. + /// Note this function creates a view over the original JaggedTensor so modifying the returned + /// JaggedTensor will modify the original tensor. 
/// @param dim The dimension to flatten /// @return A JaggedTensor with the flattened list dimension JaggedTensor jflatten(const int64_t dim = 0) const; /// @brief Sorts each batch element in ascending order, note that jdata has to be 1-dimensional - /// @return An indexing tensor with the same size as jdata, that permutes the elements of data to be in sorted order + /// @return An indexing tensor with the same size as jdata, that permutes the elements of data + /// to be in sorted order // JaggedTensor jagged_argsort(); /// @brief Compute the summation of each batch element /// @param dim The dimension to sum over /// @param keepdim Whether to keep the summed dimension - /// @return A tensor of size (batch_size, *) containing the sum of each batch element, feature dimensions are preserved + /// @return A tensor of size (batch_size, *) containing the sum of each batch element, feature + /// dimensions are preserved JaggedTensor jsum(int64_t dim = 0, bool keepdim = false) const; /// @brief Compute the minimum of each batch element @@ -342,31 +411,43 @@ class JaggedTensor : public torch::CustomClassHolder { std::vector jmax(int64_t dim = 0, bool keepdim = false) const; // Operators on raw data - inline int64_t rsize(int64_t dim) const { return mData.size(dim); } - inline int64_t rdim() const { return mData.dim(); } - inline std::vector rsizes() const { return mData.sizes().vec(); } - JaggedTensor rmask(const torch::Tensor& mask) const; + inline int64_t + rsize(int64_t dim) const { + return mData.size(dim); + } + inline int64_t + rdim() const { + return mData.dim(); + } + inline std::vector + rsizes() const { + return mData.sizes().vec(); + } + JaggedTensor rmask(const torch::Tensor &mask) const; - /// @brief Get an accessor for the JaggedTensor. Useful for reading/writing values in the JaggedTensor + /// @brief Get an accessor for the JaggedTensor. Useful for reading/writing values in the + /// JaggedTensor /// @tparam Scalar The type of the data in the JaggedTensor /// @tparam NDims The number of dimensions of the data in the JaggedTensor (i.e. edim() + 1) /// @return An accessor for the JaggedTensor template - JaggedAccessor accessor() const { + JaggedAccessor + accessor() const { return JaggedAccessor( - mBatchIdx.accessor(), - mOffsets.accessor(), - mListIdx.accessor(), - mData.accessor()); + mBatchIdx.accessor(), mOffsets.accessor(), + mListIdx.accessor(), mData.accessor()); } - /// @brief Get a packed accessor for the JaggedTensor. Useful for reading/writing values in the JaggedTensor in Cuda + /// @brief Get a packed accessor for the JaggedTensor. Useful for reading/writing values in the + /// JaggedTensor in Cuda /// @tparam Scalar The type of the data in the JaggedTensor /// @tparam NDims The number of dimensions of the data in the JaggedTensor (i.e. 
edim() + 1) /// @tparam PtrTraits The type of the pointer traits for the packed accessor /// @return A packed accessor for the JaggedTensor - template typename PtrTraits = torch::DefaultPtrTraits> - PackedJaggedAccessor32 packed_accessor32() const { + template typename PtrTraits = torch::DefaultPtrTraits> + PackedJaggedAccessor32 + packed_accessor32() const { return PackedJaggedAccessor32( mBatchIdx.packed_accessor32(), mOffsets.packed_accessor32(), @@ -375,217 +456,255 @@ class JaggedTensor : public torch::CustomClassHolder { } /// @brief Raise an exception if the JaggedTensor is in an invalid state - inline void check_valid() const { - TORCH_CHECK((jidx().size(0) == 0 && joffsets().size(0) == 2) || (jidx().size(0) == jdata().size(0)), "tensor must be a valid JaggedTensor"); - TORCH_CHECK(jidx().device() == jdata().device(), "batch index and data must be on the same device"); + inline void + check_valid() const { + TORCH_CHECK((jidx().size(0) == 0 && joffsets().size(0) == 2) || + (jidx().size(0) == jdata().size(0)), + "tensor must be a valid JaggedTensor"); + TORCH_CHECK(jidx().device() == jdata().device(), + "batch index and data must be on the same device"); TORCH_CHECK(jidx().dtype() == JIdxScalarType, "batch index must be int"); - TORCH_CHECK(joffsets().device() == jdata().device(), "offsets and data must be on the same device"); - TORCH_CHECK_VALUE(jlidx().numel() == 0 || jlidx().size(0) == (joffsets().size(0) - 1), "Corrupt list indices. This should never happen"); + TORCH_CHECK(joffsets().device() == jdata().device(), + "offsets and data must be on the same device"); + TORCH_CHECK_VALUE(jlidx().numel() == 0 || jlidx().size(0) == (joffsets().size(0) - 1), + "Corrupt list indices. This should never happen"); } - inline int64_t element_count() const { + inline int64_t + element_count() const { return jdata().size(0); } - inline torch::Device device() const { + inline torch::Device + device() const { return mData.device(); } - caffe2::TypeMeta dtype() const { + caffe2::TypeMeta + dtype() const { return mData.dtype(); } - torch::Layout layout() const { + torch::Layout + layout() const { return mData.layout(); } - inline torch::ScalarType scalar_type() const { + inline torch::ScalarType + scalar_type() const { return mData.scalar_type(); } - inline bool is_cuda() const { + inline bool + is_cuda() const { return mData.is_cuda(); } - inline bool is_cpu() const { + inline bool + is_cpu() const { return mData.is_cpu(); } - int64_t get_device() const { + int64_t + get_device() const { return mData.get_device(); } - bool is_complex() const { + bool + is_complex() const { return at::isComplexType(this->scalar_type()); } - bool is_floating_point() const { + bool + is_floating_point() const { return at::isFloatingType(this->scalar_type()); } - bool is_signed() const { + bool + is_signed() const { return at::isSignedType(this->scalar_type()); } - int64_t numel() const { + int64_t + numel() const { return mData.numel(); } - inline bool is_contiguous() const { + inline bool + is_contiguous() const { return mData.is_contiguous(); } - inline JaggedTensor contiguous() const { - return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe(mData.contiguous(), mOffsets.contiguous(), mBatchIdx.contiguous(), mListIdx.contiguous(), mNumOuterLists); + inline JaggedTensor + contiguous() const { + return JaggedTensor::from_jdata_joffsets_jidx_and_lidx_unsafe( + mData.contiguous(), mOffsets.contiguous(), mBatchIdx.contiguous(), + mListIdx.contiguous(), mNumOuterLists); } - inline JaggedTensor 
to(at::TensorOptions options={}, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) const { + inline JaggedTensor + to(at::TensorOptions options = {}, bool non_blocking = false, bool copy = false, + c10::optional memory_format = c10::nullopt) const { JaggedTensor ret = *this; - ret.mData = ret.mData.to(options, non_blocking, copy, memory_format); - ret.mBatchIdx = ret.mBatchIdx.to(ret.mData.device(), non_blocking, copy, memory_format); - ret.mOffsets = ret.mOffsets.to(ret.mData.device(), non_blocking, copy, memory_format); - ret.mListIdx = ret.mListIdx.to(ret.mData.device(), non_blocking, copy, memory_format); + ret.mData = ret.mData.to(options, non_blocking, copy, memory_format); + ret.mBatchIdx = ret.mBatchIdx.to(ret.mData.device(), non_blocking, copy, memory_format); + ret.mOffsets = ret.mOffsets.to(ret.mData.device(), non_blocking, copy, memory_format); + ret.mListIdx = ret.mListIdx.to(ret.mData.device(), non_blocking, copy, memory_format); return ret; } - inline JaggedTensor to(c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, bool non_blocking, bool copy, c10::optional memory_format) { + inline JaggedTensor + to(c10::optional dtype, c10::optional layout, + c10::optional device, c10::optional pin_memory, bool non_blocking, + bool copy, c10::optional memory_format) { JaggedTensor ret = *this; - ret.mData = ret.mData.to(dtype, layout, device, pin_memory, non_blocking, copy, memory_format); - ret.mBatchIdx = ret.mBatchIdx.to(JIdxScalarType, layout, device, pin_memory, non_blocking, copy, memory_format); - ret.mOffsets = ret.mOffsets.to(JOffsetsScalarType, layout, device, pin_memory, non_blocking, copy, memory_format); - ret.mListIdx = ret.mListIdx.to(JLIdxScalarType, layout, device, pin_memory, non_blocking, copy, memory_format); + ret.mData = + ret.mData.to(dtype, layout, device, pin_memory, non_blocking, copy, memory_format); + ret.mBatchIdx = ret.mBatchIdx.to(JIdxScalarType, layout, device, pin_memory, non_blocking, + copy, memory_format); + ret.mOffsets = ret.mOffsets.to(JOffsetsScalarType, layout, device, pin_memory, non_blocking, + copy, memory_format); + ret.mListIdx = ret.mListIdx.to(JLIdxScalarType, layout, device, pin_memory, non_blocking, + copy, memory_format); return ret; } - inline JaggedTensor to(torch::Device device, torch::ScalarType dtype, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) { + inline JaggedTensor + to(torch::Device device, torch::ScalarType dtype, bool non_blocking = false, bool copy = false, + c10::optional memory_format = c10::nullopt) { JaggedTensor ret = *this; - ret.mData = ret.mData.to(device, dtype, non_blocking, copy, memory_format); - ret.mBatchIdx = ret.mBatchIdx.to(device, non_blocking, copy, memory_format); - ret.mOffsets = ret.mOffsets.to(device, non_blocking, copy, memory_format); - ret.mListIdx = ret.mListIdx.to(device, non_blocking, copy, memory_format); + ret.mData = ret.mData.to(device, dtype, non_blocking, copy, memory_format); + ret.mBatchIdx = ret.mBatchIdx.to(device, non_blocking, copy, memory_format); + ret.mOffsets = ret.mOffsets.to(device, non_blocking, copy, memory_format); + ret.mListIdx = ret.mListIdx.to(device, non_blocking, copy, memory_format); return ret; } - inline JaggedTensor to(torch::ScalarType dtype, bool non_blocking=false, bool copy=false, c10::optional memory_format=c10::nullopt) { + inline JaggedTensor + to(torch::ScalarType dtype, bool non_blocking = false, bool copy = false, + c10::optional memory_format 
= c10::nullopt) { JaggedTensor ret = *this; - ret.mData = ret.mData.to(dtype, non_blocking, copy, memory_format); - ret.mBatchIdx = ret.mBatchIdx.to(JIdxScalarType, non_blocking, copy, memory_format); - ret.mOffsets = ret.mOffsets.to(JOffsetsScalarType, non_blocking, copy, memory_format); - ret.mListIdx = ret.mListIdx.to(JLIdxScalarType, non_blocking, copy, memory_format); + ret.mData = ret.mData.to(dtype, non_blocking, copy, memory_format); + ret.mBatchIdx = ret.mBatchIdx.to(JIdxScalarType, non_blocking, copy, memory_format); + ret.mOffsets = ret.mOffsets.to(JOffsetsScalarType, non_blocking, copy, memory_format); + ret.mListIdx = ret.mListIdx.to(JLIdxScalarType, non_blocking, copy, memory_format); return ret; } - torch::TensorOptions options() const { + torch::TensorOptions + options() const { return torch::TensorOptions().dtype(dtype()).device(device()).layout(layout()); } - JaggedTensor cuda() const { + JaggedTensor + cuda() const { return to(this->options().device(torch::kCUDA), /*non_blocking*/ false, /*copy*/ false); } - JaggedTensor cpu() const { + JaggedTensor + cpu() const { return to(this->options().device(torch::kCPU), /*non_blocking*/ false, /*copy*/ false); } - JaggedTensor operator+(const JaggedTensor& other) const; + JaggedTensor operator+(const JaggedTensor &other) const; JaggedTensor operator+(const int other) const; JaggedTensor operator+(const float other) const; - JaggedTensor operator+(const torch::Tensor& other) const; + JaggedTensor operator+(const torch::Tensor &other) const; - JaggedTensor& operator+=(const JaggedTensor& other); - JaggedTensor& operator+=(const int other); - JaggedTensor& operator+=(const float other); - JaggedTensor& operator+=(const torch::Tensor& other); + JaggedTensor &operator+=(const JaggedTensor &other); + JaggedTensor &operator+=(const int other); + JaggedTensor &operator+=(const float other); + JaggedTensor &operator+=(const torch::Tensor &other); - JaggedTensor operator-(const JaggedTensor& other) const; + JaggedTensor operator-(const JaggedTensor &other) const; JaggedTensor operator-(const int other) const; JaggedTensor operator-(const float other) const; - JaggedTensor operator-(const torch::Tensor& other) const; + JaggedTensor operator-(const torch::Tensor &other) const; JaggedTensor operator-() const; - JaggedTensor& operator-=(const JaggedTensor& other); - JaggedTensor& operator-=(const int other); - JaggedTensor& operator-=(const float other); - JaggedTensor& operator-=(const torch::Tensor& other); + JaggedTensor &operator-=(const JaggedTensor &other); + JaggedTensor &operator-=(const int other); + JaggedTensor &operator-=(const float other); + JaggedTensor &operator-=(const torch::Tensor &other); - JaggedTensor operator*(const JaggedTensor& other) const; + JaggedTensor operator*(const JaggedTensor &other) const; JaggedTensor operator*(const int other) const; JaggedTensor operator*(const float other) const; - JaggedTensor operator*(const torch::Tensor& other) const; + JaggedTensor operator*(const torch::Tensor &other) const; - JaggedTensor& operator*=(const JaggedTensor& other); - JaggedTensor& operator*=(const int other); - JaggedTensor& operator*=(const float other); - JaggedTensor& operator*=(const torch::Tensor& other); + JaggedTensor &operator*=(const JaggedTensor &other); + JaggedTensor &operator*=(const int other); + JaggedTensor &operator*=(const float other); + JaggedTensor &operator*=(const torch::Tensor &other); - JaggedTensor operator/(const JaggedTensor& other) const; + JaggedTensor operator/(const JaggedTensor &other) 
const; JaggedTensor operator/(const int other) const; JaggedTensor operator/(const float other) const; - JaggedTensor operator/(const torch::Tensor& other) const; + JaggedTensor operator/(const torch::Tensor &other) const; - JaggedTensor& operator/=(const JaggedTensor& other); - JaggedTensor& operator/=(const int other); - JaggedTensor& operator/=(const float other); - JaggedTensor& operator/=(const torch::Tensor& other); + JaggedTensor &operator/=(const JaggedTensor &other); + JaggedTensor &operator/=(const int other); + JaggedTensor &operator/=(const float other); + JaggedTensor &operator/=(const torch::Tensor &other); - JaggedTensor floordiv(const JaggedTensor& other) const; + JaggedTensor floordiv(const JaggedTensor &other) const; JaggedTensor floordiv(const int other) const; JaggedTensor floordiv(const float other) const; - JaggedTensor floordiv(const torch::Tensor& other) const; + JaggedTensor floordiv(const torch::Tensor &other) const; - JaggedTensor& floordiveq(const JaggedTensor& other); - JaggedTensor& floordiveq(const int other); - JaggedTensor& floordiveq(const float other); - JaggedTensor& floordiveq(const torch::Tensor& other); + JaggedTensor &floordiveq(const JaggedTensor &other); + JaggedTensor &floordiveq(const int other); + JaggedTensor &floordiveq(const float other); + JaggedTensor &floordiveq(const torch::Tensor &other); - JaggedTensor operator%(const JaggedTensor& other) const; + JaggedTensor operator%(const JaggedTensor &other) const; JaggedTensor operator%(const int other) const; JaggedTensor operator%(const float other) const; - JaggedTensor operator%(const torch::Tensor& other) const; + JaggedTensor operator%(const torch::Tensor &other) const; - JaggedTensor& operator%=(const JaggedTensor& other); - JaggedTensor& operator%=(const int other); - JaggedTensor& operator%=(const float other); - JaggedTensor& operator%=(const torch::Tensor& other); + JaggedTensor &operator%=(const JaggedTensor &other); + JaggedTensor &operator%=(const int other); + JaggedTensor &operator%=(const float other); + JaggedTensor &operator%=(const torch::Tensor &other); - JaggedTensor pow(const JaggedTensor& other) const; + JaggedTensor pow(const JaggedTensor &other) const; JaggedTensor pow(const int other) const; JaggedTensor pow(const float other) const; - JaggedTensor pow(const torch::Tensor& other) const; + JaggedTensor pow(const torch::Tensor &other) const; - JaggedTensor& poweq(const JaggedTensor& other); - JaggedTensor& poweq(const int other); - JaggedTensor& poweq(const float other); - JaggedTensor& poweq(const torch::Tensor& other); + JaggedTensor &poweq(const JaggedTensor &other); + JaggedTensor &poweq(const int other); + JaggedTensor &poweq(const float other); + JaggedTensor &poweq(const torch::Tensor &other); - JaggedTensor operator>(const JaggedTensor& other) const; + JaggedTensor operator>(const JaggedTensor &other) const; JaggedTensor operator>(const int other) const; JaggedTensor operator>(const float other) const; - JaggedTensor operator>(const torch::Tensor& other) const; + JaggedTensor operator>(const torch::Tensor &other) const; - JaggedTensor operator>=(const JaggedTensor& other) const; + JaggedTensor operator>=(const JaggedTensor &other) const; JaggedTensor operator>=(const int other) const; JaggedTensor operator>=(const float other) const; - JaggedTensor operator>=(const torch::Tensor& other) const; + JaggedTensor operator>=(const torch::Tensor &other) const; - JaggedTensor operator<(const JaggedTensor& other) const; + JaggedTensor operator<(const JaggedTensor &other) 
const; JaggedTensor operator<(const int other) const; JaggedTensor operator<(const float other) const; - JaggedTensor operator<(const torch::Tensor& other) const; + JaggedTensor operator<(const torch::Tensor &other) const; - JaggedTensor operator<=(const JaggedTensor& other) const; + JaggedTensor operator<=(const JaggedTensor &other) const; JaggedTensor operator<=(const int other) const; JaggedTensor operator<=(const float other) const; - JaggedTensor operator<=(const torch::Tensor& other) const; + JaggedTensor operator<=(const torch::Tensor &other) const; - JaggedTensor operator==(const JaggedTensor& other) const; + JaggedTensor operator==(const JaggedTensor &other) const; JaggedTensor operator==(const int other) const; JaggedTensor operator==(const float other) const; - JaggedTensor operator==(const torch::Tensor& other) const; + JaggedTensor operator==(const torch::Tensor &other) const; - JaggedTensor operator!=(const JaggedTensor& other) const; + JaggedTensor operator!=(const JaggedTensor &other) const; JaggedTensor operator!=(const int other) const; JaggedTensor operator!=(const float other) const; - JaggedTensor operator!=(const torch::Tensor& other) const; + JaggedTensor operator!=(const torch::Tensor &other) const; JaggedTensor sqrt() const; JaggedTensor abs() const; @@ -593,88 +712,111 @@ class JaggedTensor : public torch::CustomClassHolder { JaggedTensor floor() const; JaggedTensor ceil() const; - JaggedTensor& sqrt_(); - JaggedTensor& abs_(); - JaggedTensor& round_(int decimals = 0); - JaggedTensor& floor_(); - JaggedTensor& ceil_(); + JaggedTensor &sqrt_(); + JaggedTensor &abs_(); + JaggedTensor &round_(int decimals = 0); + JaggedTensor &floor_(); + JaggedTensor &ceil_(); - const JaggedTensor& set_requires_grad(bool requires_grad) const; - bool requires_grad() const; - JaggedTensor detach() const; - JaggedTensor clone() const; + const JaggedTensor &set_requires_grad(bool requires_grad) const; + bool requires_grad() const; + JaggedTensor detach() const; + JaggedTensor clone() const; }; - struct JaggedTensorIndex { JaggedTensorIndex(c10::nullopt_t) : mType(JaggedTensorIndexType::None) {} JaggedTensorIndex(int64_t integer) : mType(JaggedTensorIndexType::Integer), mInteger(integer) {} - JaggedTensorIndex(torch::indexing::EllipsisIndexType) : mType(JaggedTensorIndexType::Ellipsis) {} + JaggedTensorIndex(torch::indexing::EllipsisIndexType) + : mType(JaggedTensorIndexType::Ellipsis) {} JaggedTensorIndex(at::Tensor tensor) : mType(JaggedTensorIndexType::Tensor), mTensor(tensor) {} - JaggedTensorIndex(torch::indexing::Slice slice) : mType(JaggedTensorIndexType::Slice), mSlice(slice) {} - JaggedTensorIndex(fvdb::JaggedTensor jaggedTensor) : mType(JaggedTensorIndexType::JaggedTensor), mJaggedTensor(jaggedTensor) {} + JaggedTensorIndex(torch::indexing::Slice slice) + : mType(JaggedTensorIndexType::Slice), mSlice(slice) {} + JaggedTensorIndex(fvdb::JaggedTensor jaggedTensor) + : mType(JaggedTensorIndexType::JaggedTensor), mJaggedTensor(jaggedTensor) {} template ::value>::type> JaggedTensorIndex(T boolean) : mType(JaggedTensorIndexType::Boolean), mBoolean(boolean) {} - inline bool is_none() const { + inline bool + is_none() const { return mType == JaggedTensorIndexType::None; } - inline bool is_ellipsis() const { + inline bool + is_ellipsis() const { return mType == JaggedTensorIndexType::Ellipsis; } - inline bool is_integer() const { + inline bool + is_integer() const { return mType == JaggedTensorIndexType::Integer; } - inline bool is_boolean() const { + inline bool + is_boolean() const { 
return mType == JaggedTensorIndexType::Boolean; } - inline bool is_slice() const { + inline bool + is_slice() const { return mType == JaggedTensorIndexType::Slice; } - inline bool is_tensor() const { + inline bool + is_tensor() const { return mType == JaggedTensorIndexType::Tensor; } - inline bool is_jagged_tensor() const { + inline bool + is_jagged_tensor() const { return mType == JaggedTensorIndexType::JaggedTensor; } - inline int64_t integer() const { + inline int64_t + integer() const { return mInteger; } - inline bool boolean() const { + inline bool + boolean() const { return mBoolean; } - inline const torch::indexing::Slice& slice() const { + inline const torch::indexing::Slice & + slice() const { return mSlice; } - inline const torch::Tensor& tensor() const { + inline const torch::Tensor & + tensor() const { return mTensor; } - inline const fvdb::JaggedTensor& jagged_tensor() const { + inline const fvdb::JaggedTensor & + jagged_tensor() const { return mJaggedTensor; } -private: - enum class JaggedTensorIndexType { None, Ellipsis, Integer, Slice, Tensor, Boolean, JaggedTensor }; + private: + enum class JaggedTensorIndexType { + None, + Ellipsis, + Integer, + Slice, + Tensor, + Boolean, + JaggedTensor + }; JaggedTensorIndexType mType; - torch::Tensor mTensor; - int64_t mInteger; + torch::Tensor mTensor; + int64_t mInteger; torch::indexing::Slice mSlice; - bool mBoolean; - fvdb::JaggedTensor mJaggedTensor; + bool mBoolean; + fvdb::JaggedTensor mJaggedTensor; }; +} // namespace fvdb -} // namespace fvdb \ No newline at end of file +#endif // FVDB_JAGGEDTENSOR_H \ No newline at end of file diff --git a/fvdb/src/SparseConvPackInfo.cpp b/fvdb/src/SparseConvPackInfo.cpp index 4f8a46ed63..cb2f9fca94 100644 --- a/fvdb/src/SparseConvPackInfo.cpp +++ b/fvdb/src/SparseConvPackInfo.cpp @@ -3,17 +3,19 @@ // #include "SparseConvPackInfo.h" +#include "detail/autograd/Autograd.h" #include "detail/ops/Ops.h" #include "detail/ops/convolution/pack_info/PackInfoOps.h" -#include "detail/autograd/Autograd.h" - namespace fvdb { -SparseConvPackInfo::SparseConvPackInfo(Vec3iOrScalar kernelsize, Vec3iOrScalar stride, GridBatch srcGrid, - torch::optional maybeTarget) { - TORCH_CHECK(Vec3iOrScalar(0).value() < kernelsize.value(), "Expect kernel size to be larger than {0,0,0}, but got " + kernelsize.toString() + "."); - TORCH_CHECK(Vec3iOrScalar(0).value() < stride.value(), "Expect stride to be larger than 0, but got " + stride.toString() + "."); +SparseConvPackInfo::SparseConvPackInfo(Vec3iOrScalar kernelsize, Vec3iOrScalar stride, + GridBatch srcGrid, torch::optional maybeTarget) { + TORCH_CHECK(Vec3iOrScalar(0).value() < kernelsize.value(), + "Expect kernel size to be larger than {0,0,0}, but got " + kernelsize.toString() + + "."); + TORCH_CHECK(Vec3iOrScalar(0).value() < stride.value(), + "Expect stride to be larger than 0, but got " + stride.toString() + "."); GridBatch targetGrid; if (!maybeTarget.has_value()) { @@ -26,47 +28,56 @@ SparseConvPackInfo::SparseConvPackInfo(Vec3iOrScalar kernelsize, Vec3iOrScalar s targetGrid = maybeTarget.value(); } - TORCH_CHECK(srcGrid.is_mutable() == targetGrid.is_mutable(), "Source and target grids must both be mutable or immutable"); - TORCH_CHECK(srcGrid.device() == targetGrid.device(), "Source and target grids must both be on the same device"); - TORCH_CHECK(srcGrid.device() == targetGrid.device(), "Device should match between this grid and target grid."); - TORCH_CHECK(!(kernelsize.value() == Vec3iOrScalar(1).value() && stride.value() == Vec3iOrScalar(1).value()), "1x1 
conv does not need kernel map to be built!"); - - mStride = stride; + TORCH_CHECK(srcGrid.is_mutable() == targetGrid.is_mutable(), + "Source and target grids must both be mutable or immutable"); + TORCH_CHECK(srcGrid.device() == targetGrid.device(), + "Source and target grids must both be on the same device"); + TORCH_CHECK(srcGrid.device() == targetGrid.device(), + "Device should match between this grid and target grid."); + TORCH_CHECK(!(kernelsize.value() == Vec3iOrScalar(1).value() && + stride.value() == Vec3iOrScalar(1).value()), + "1x1 conv does not need kernel map to be built!"); + + mStride = stride; mKernelSize = kernelsize; mTargetGrid = targetGrid; mSourceGrid = srcGrid; } -void SparseConvPackInfo::buildGatherScatter(bool use_me) { +void +SparseConvPackInfo::buildGatherScatter(bool use_me) { if (mGSNeighborMap.has_value() && mGSNeighborSizes.has_value()) { - TORCH_CHECK(mGSUseME == use_me, "Gather scatter is already built with different use_me value"); + TORCH_CHECK(mGSUseME == use_me, + "Gather scatter is already built with different use_me value"); return; } int kernelVolume = mKernelSize.value().x() * mKernelSize.value().y() * mKernelSize.value().z(); - torch::Tensor kmap = torch::full( - {mTargetGrid.total_voxels(), kernelVolume}, -1, - torch::TensorOptions().dtype(torch::kInt32).device(mTargetGrid.device())); + torch::Tensor kmap = + torch::full({ mTargetGrid.total_voxels(), kernelVolume }, -1, + torch::TensorOptions().dtype(torch::kInt32).device(mTargetGrid.device())); FVDB_DISPATCH_KERNEL_DEVICE(mSourceGrid.device(), [&]() { detail::ops::dispatchConvolutionKernelMap( *mSourceGrid.impl(), *mTargetGrid.impl(), kmap, mKernelSize, mStride); }); - kmap = kmap.t(); - torch::Tensor kmask = kmap != -1; + kmap = kmap.t(); + torch::Tensor kmask = kmap != -1; torch::Tensor nbsizes = torch::sum(kmask, -1); - torch::Tensor nbmap = torch::nonzero(kmask).contiguous(); + torch::Tensor nbmap = torch::nonzero(kmask).contiguous(); - torch::Tensor indices = nbmap.index({torch::indexing::Slice(), 0}) * kmap.size(1) + \ - nbmap.index({torch::indexing::Slice(), 1}); - nbmap.index_put_({torch::indexing::Slice(), 0}, kmap.reshape({-1}).index({indices})); - mGSNeighborMap = nbmap.to(torch::kInt32); + torch::Tensor indices = nbmap.index({ torch::indexing::Slice(), 0 }) * kmap.size(1) + + nbmap.index({ torch::indexing::Slice(), 1 }); + nbmap.index_put_({ torch::indexing::Slice(), 0 }, kmap.reshape({ -1 }).index({ indices })); + mGSNeighborMap = nbmap.to(torch::kInt32); mGSNeighborSizes = nbsizes.to(torch::kInt32); - mGSUseME = use_me; + mGSUseME = use_me; } -void SparseConvPackInfo::buildImplicitGEMM(bool sorted, int splitMaskNum, bool training, int splitMaskNumBwd, bool use_tf32) { +void +SparseConvPackInfo::buildImplicitGEMM(bool sorted, int splitMaskNum, bool training, + int splitMaskNumBwd, bool use_tf32) { if (mIGEMMOutInMap.has_value()) { if (mIGEMMReorderLoc.has_value()) { TORCH_CHECK(mIGEMMReorderLoc->size(0) == splitMaskNum, @@ -80,9 +91,9 @@ void SparseConvPackInfo::buildImplicitGEMM(bool sorted, int splitMaskNum, bool t int kernelVolume = mKernelSize.value().x() * mKernelSize.value().y() * mKernelSize.value().z(); int outInMapSize = (mTargetGrid.total_voxels() + 128 - 1) / 128 * 128; - mIGEMMOutInMap = torch::full( - {outInMapSize, kernelVolume}, -1, - torch::TensorOptions().dtype(torch::kInt32).device(mTargetGrid.device())); + mIGEMMOutInMap = + torch::full({ outInMapSize, kernelVolume }, -1, + torch::TensorOptions().dtype(torch::kInt32).device(mTargetGrid.device())); mIGEMMUseTF32 = 
use_tf32; // Note: This could also be converted from GSNeighbourMap if exists @@ -92,141 +103,146 @@ void SparseConvPackInfo::buildImplicitGEMM(bool sorted, int splitMaskNum, bool t }); if (sorted) { - TORCH_CHECK(mSourceGrid.device().is_cuda(), "Implicit GEMM with sorted kernel map is only supported on CUDA"); + TORCH_CHECK(mSourceGrid.device().is_cuda(), + "Implicit GEMM with sorted kernel map is only supported on CUDA"); torch::Tensor bitmask = detail::ops::dispatchBitmaskFromOutInMap( mIGEMMOutInMap.value(), splitMaskNum, mTargetGrid.total_voxels()); - auto ret = torch::sort(bitmask, -1L, true); - mIGEMMSortedMask = std::get<0>(ret); // Mainly used for transpose. - mIGEMMReorderLoc = std::get<1>(ret).to(torch::kInt32); + auto ret = torch::sort(bitmask, -1L, true); + mIGEMMSortedMask = std::get<0>(ret); // Mainly used for transpose. + mIGEMMReorderLoc = std::get<1>(ret).to(torch::kInt32); mIGEMMReoderOutInMap = detail::ops::dispatchReorderOutInMap( mIGEMMOutInMap.value(), mIGEMMReorderLoc.value()); - mIGEMMReducedSortedMask = detail::ops::dispatchReduceMask( - mIGEMMSortedMask.value(), 128); + mIGEMMReducedSortedMask = + detail::ops::dispatchReduceMask(mIGEMMSortedMask.value(), 128); } if (training) { int outInMapTSize = (mSourceGrid.total_voxels() + 128 - 1) / 128 * 128; - mIGEMMOutInMapBwd = torch::full( - {outInMapTSize, kernelVolume}, -1, - torch::TensorOptions().dtype(torch::kInt32).device(mSourceGrid.device())); - detail::ops::dispatchTransposeOutInMap( - mIGEMMOutInMap.value(), mIGEMMOutInMapBwd.value()); + mIGEMMOutInMapBwd = + torch::full({ outInMapTSize, kernelVolume }, -1, + torch::TensorOptions().dtype(torch::kInt32).device(mSourceGrid.device())); + detail::ops::dispatchTransposeOutInMap(mIGEMMOutInMap.value(), + mIGEMMOutInMapBwd.value()); torch::Tensor bitmask = detail::ops::dispatchBitmaskFromOutInMap( mIGEMMOutInMapBwd.value(), splitMaskNumBwd, mSourceGrid.total_voxels()); - auto ret = torch::sort(bitmask, -1L, true); + auto ret = torch::sort(bitmask, -1L, true); torch::Tensor sortedMaskBwd = std::get<0>(ret); - mIGEMMReorderLocBwd = std::get<1>(ret).to(torch::kInt32); - mIGEMMReorderOutInMapBwd = detail::ops::dispatchReorderOutInMap( + mIGEMMReorderLocBwd = std::get<1>(ret).to(torch::kInt32); + mIGEMMReorderOutInMapBwd = detail::ops::dispatchReorderOutInMap( mIGEMMOutInMapBwd.value(), mIGEMMReorderLocBwd.value()); - mIGEMMSortedMaskBwdW = detail::ops::dispatchReduceMask( - sortedMaskBwd, 64); - mIGEMMSortedMaskBwdD = detail::ops::dispatchReduceMask( - sortedMaskBwd, 128); + mIGEMMSortedMaskBwdW = detail::ops::dispatchReduceMask(sortedMaskBwd, 64); + mIGEMMSortedMaskBwdD = detail::ops::dispatchReduceMask(sortedMaskBwd, 128); } - } -SparseConvPackInfo SparseConvPackInfo::transposed() const { +SparseConvPackInfo +SparseConvPackInfo::transposed() const { SparseConvPackInfo ret(mKernelSize, mStride, mSourceGrid, mTargetGrid); - bool sorted = mIGEMMReorderLoc.has_value(); - bool training = mIGEMMOutInMapBwd.has_value(); + bool sorted = mIGEMMReorderLoc.has_value(); + bool training = mIGEMMOutInMapBwd.has_value(); int splitMaskNum = mIGEMMReorderLoc.has_value() ? 
mIGEMMReorderLoc.value().size(0) : 1; int outInMapSize = (mSourceGrid.total_voxels() + 128 - 1) / 128 * 128; int kernelVolume = mKernelSize.value().x() * mKernelSize.value().y() * mKernelSize.value().z(); - ret.mIGEMMOutInMap = torch::full( - {outInMapSize, kernelVolume}, -1, - torch::TensorOptions().dtype(torch::kInt32).device(mSourceGrid.device())); - detail::ops::dispatchTransposeOutInMap( - mIGEMMOutInMap.value(), ret.mIGEMMOutInMap.value()); + ret.mIGEMMOutInMap = + torch::full({ outInMapSize, kernelVolume }, -1, + torch::TensorOptions().dtype(torch::kInt32).device(mSourceGrid.device())); + detail::ops::dispatchTransposeOutInMap(mIGEMMOutInMap.value(), + ret.mIGEMMOutInMap.value()); if (sorted) { if (training) { - ret.mIGEMMOutInMapBwd = mIGEMMOutInMap; + ret.mIGEMMOutInMapBwd = mIGEMMOutInMap; ret.mIGEMMReorderOutInMapBwd = mIGEMMReoderOutInMap; - ret.mIGEMMReorderLocBwd = mIGEMMReorderLoc; - torch::Tensor sortedMaskBwd = mIGEMMSortedMask.value(); - ret.mIGEMMSortedMaskBwdW = detail::ops::dispatchReduceMask( - sortedMaskBwd, 64); - ret.mIGEMMSortedMaskBwdD = detail::ops::dispatchReduceMask( - sortedMaskBwd, 128); + ret.mIGEMMReorderLocBwd = mIGEMMReorderLoc; + torch::Tensor sortedMaskBwd = mIGEMMSortedMask.value(); + ret.mIGEMMSortedMaskBwdW = + detail::ops::dispatchReduceMask(sortedMaskBwd, 64); + ret.mIGEMMSortedMaskBwdD = + detail::ops::dispatchReduceMask(sortedMaskBwd, 128); } torch::Tensor bitmask = detail::ops::dispatchBitmaskFromOutInMap( ret.mIGEMMOutInMap.value(), splitMaskNum, mSourceGrid.total_voxels()); - auto rets = torch::sort(bitmask, -1L, true); - ret.mIGEMMSortedMask = std::get<0>(rets); - ret.mIGEMMReorderLoc = std::get<1>(rets).to(torch::kInt32); + auto rets = torch::sort(bitmask, -1L, true); + ret.mIGEMMSortedMask = std::get<0>(rets); + ret.mIGEMMReorderLoc = std::get<1>(rets).to(torch::kInt32); ret.mIGEMMReoderOutInMap = detail::ops::dispatchReorderOutInMap( ret.mIGEMMOutInMap.value(), ret.mIGEMMReorderLoc.value()); - ret.mIGEMMReducedSortedMask = detail::ops::dispatchReduceMask( - ret.mIGEMMSortedMask.value(), 128); + ret.mIGEMMReducedSortedMask = + detail::ops::dispatchReduceMask(ret.mIGEMMSortedMask.value(), 128); } else if (training) { - int splitMaskNumBwd = mIGEMMReorderLocBwd.value().size(0); - ret.mIGEMMOutInMapBwd = mIGEMMOutInMap; + int splitMaskNumBwd = mIGEMMReorderLocBwd.value().size(0); + ret.mIGEMMOutInMapBwd = mIGEMMOutInMap; torch::Tensor bitmaskBwd = detail::ops::dispatchBitmaskFromOutInMap( ret.mIGEMMOutInMapBwd.value(), splitMaskNumBwd, mTargetGrid.total_voxels()); - auto rets = torch::sort(bitmaskBwd, -1L, true); - torch::Tensor sortedMaskBwd = std::get<0>(rets); - ret.mIGEMMReorderLocBwd = std::get<1>(rets).to(torch::kInt32); + auto rets = torch::sort(bitmaskBwd, -1L, true); + torch::Tensor sortedMaskBwd = std::get<0>(rets); + ret.mIGEMMReorderLocBwd = std::get<1>(rets).to(torch::kInt32); ret.mIGEMMReorderOutInMapBwd = detail::ops::dispatchReorderOutInMap( ret.mIGEMMOutInMapBwd.value(), ret.mIGEMMReorderLocBwd.value()); - ret.mIGEMMSortedMaskBwdW = detail::ops::dispatchReduceMask( - sortedMaskBwd, 64); - ret.mIGEMMSortedMaskBwdD = detail::ops::dispatchReduceMask( - sortedMaskBwd, 128); + ret.mIGEMMSortedMaskBwdW = detail::ops::dispatchReduceMask(sortedMaskBwd, 64); + ret.mIGEMMSortedMaskBwdD = + detail::ops::dispatchReduceMask(sortedMaskBwd, 128); } ret.mIGEMMUseTF32 = mIGEMMUseTF32; return ret; } -void SparseConvPackInfo::buildCutlass(bool benchmark) { +void +SparseConvPackInfo::buildCutlass(bool benchmark) { if 
(mCUTLASSHaloIndexBuffer.has_value()) { - TORCH_CHECK(mCUTLASSBenchmark == benchmark, "Cutlass is already built with different benchmark flag"); + TORCH_CHECK(mCUTLASSBenchmark == benchmark, + "Cutlass is already built with different benchmark flag"); return; } std::vector res = FVDB_DISPATCH_KERNEL_DEVICE(mSourceGrid.device(), [&]() { - return detail::ops::dispatchBrickHaloBuffer( - *mSourceGrid.impl(), benchmark); + return detail::ops::dispatchBrickHaloBuffer(*mSourceGrid.impl(), benchmark); }); - mCUTLASSHaloIndexBuffer = res[1]; - mCUTLASSOutputIndexBuffer = res[2]; - mCUTLASSBenchmark = benchmark; + mCUTLASSHaloIndexBuffer = res[1]; + mCUTLASSOutputIndexBuffer = res[2]; + mCUTLASSBenchmark = benchmark; } -void SparseConvPackInfo::buildLGGS() { - TORCH_CHECK(mKernelSize.value().x() == 3 && mKernelSize.value().y() == 3 && mKernelSize.value().z() == 3, +void +SparseConvPackInfo::buildLGGS() { + TORCH_CHECK(mKernelSize.value().x() == 3 && mKernelSize.value().y() == 3 && + mKernelSize.value().z() == 3, "LGGS only supports 3x3x3 kernel size"); - int outInMapSize = (mTargetGrid.total_voxels() + 64 - 1) / 64 * 64; - torch::Tensor outInMap = torch::full( - {outInMapSize, 27}, -1, - torch::TensorOptions().dtype(torch::kInt32).device(mTargetGrid.device())); + int outInMapSize = (mTargetGrid.total_voxels() + 64 - 1) / 64 * 64; + torch::Tensor outInMap = + torch::full({ outInMapSize, 27 }, -1, + torch::TensorOptions().dtype(torch::kInt32).device(mTargetGrid.device())); FVDB_DISPATCH_KERNEL_DEVICE(mSourceGrid.device(), [&]() { detail::ops::dispatchConvolutionKernelMap( *mSourceGrid.impl(), *mTargetGrid.impl(), outInMap, mKernelSize, mStride); }); - outInMap = outInMap.view({-1, 64, 27}).transpose(1, 2); // [#blocks, 27, 64] + outInMap = outInMap.view({ -1, 64, 27 }).transpose(1, 2); // [#blocks, 27, 64] torch::Tensor mapMask = outInMap != -1; - torch::Tensor mapNNZ = torch::nonzero(mapMask); + torch::Tensor mapNNZ = torch::nonzero(mapMask); torch::Tensor kernelRanges = mapMask.sum(-1).view(-1).cumsum(0); - kernelRanges = torch::cat({torch::zeros(1, kernelRanges.options()), kernelRanges}, 0); + kernelRanges = torch::cat({ torch::zeros(1, kernelRanges.options()), kernelRanges }, 0); - torch::Tensor relOutIndices = mapNNZ.index({torch::indexing::Slice(), -1}); - torch::Tensor inIndices = outInMap.index({mapNNZ.index({torch::indexing::Slice(), 0}), - mapNNZ.index({torch::indexing::Slice(), 1}), - mapNNZ.index({torch::indexing::Slice(), 2})}); + torch::Tensor relOutIndices = mapNNZ.index({ torch::indexing::Slice(), -1 }); + torch::Tensor inIndices = outInMap.index({ mapNNZ.index({ torch::indexing::Slice(), 0 }), + mapNNZ.index({ torch::indexing::Slice(), 1 }), + mapNNZ.index({ torch::indexing::Slice(), 2 }) }); mLGGSSpokeOutputLocalOffsetsRelativeToBlockFlattenedData = relOutIndices.to(torch::kInt32); - mLGGSSpokeInputGlobalIndicesFlattenedData = inIndices; - mLGGSSpokeIndicesFlattenedOffset = kernelRanges.to(torch::kInt32); + mLGGSSpokeInputGlobalIndicesFlattenedData = inIndices; + mLGGSSpokeIndicesFlattenedOffset = kernelRanges.to(torch::kInt32); } -JaggedTensor SparseConvPackInfo::sparseConv3d(const JaggedTensor& input, const torch::Tensor& weights, ConvPackBackend backend) const { - TORCH_CHECK_VALUE(input.num_outer_lists() == mSourceGrid.grid_count(), "Input batch size must match target grid batch size"); - TORCH_CHECK_VALUE(input.element_count() == mSourceGrid.total_voxels(), "Input element count must match target grid total voxels"); +JaggedTensor +SparseConvPackInfo::sparseConv3d(const 
JaggedTensor &input, const torch::Tensor &weights, + ConvPackBackend backend) const { + TORCH_CHECK_VALUE(input.num_outer_lists() == mSourceGrid.grid_count(), + "Input batch size must match target grid batch size"); + TORCH_CHECK_VALUE(input.element_count() == mSourceGrid.total_voxels(), + "Input element count must match target grid total voxels"); if (backend == ConvPackBackend::GATHER_SCATTER) { auto ret = detail::autograd::SparseConvolutionKernelMap::apply( @@ -242,41 +258,43 @@ JaggedTensor SparseConvPackInfo::sparseConv3d(const JaggedTensor& input, const t // Re-shape kernel from [Do, Di, D, H, W] to [Do, D, H, W, Di]. TORCH_CHECK(mCUTLASSHaloIndexBuffer.has_value() && mCUTLASSOutputIndexBuffer.has_value(), "Cutlass buffer is not built"); - auto kernel = weights.permute({0, 4, 3, 2, 1}).contiguous(); + auto kernel = weights.permute({ 0, 4, 3, 2, 1 }).contiguous(); torch::Tensor out = FVDB_DISPATCH_KERNEL_DEVICE(mCUTLASSHaloIndexBuffer->device(), [&]() { return detail::ops::dispatchSparseConvolutionCutlass( - input.jdata(), kernel, - mCUTLASSHaloIndexBuffer.value(), mCUTLASSOutputIndexBuffer.value(), - mCUTLASSBenchmark); + input.jdata(), kernel, mCUTLASSHaloIndexBuffer.value(), + mCUTLASSOutputIndexBuffer.value(), mCUTLASSBenchmark); }); return mTargetGrid.impl()->jaggedTensor(out, false); } else if (backend == ConvPackBackend::LGGS) { TORCH_CHECK(mLGGSSpokeIndicesFlattenedOffset.has_value() && - mLGGSSpokeInputGlobalIndicesFlattenedData.has_value() && - mLGGSSpokeOutputLocalOffsetsRelativeToBlockFlattenedData.has_value(), + mLGGSSpokeInputGlobalIndicesFlattenedData.has_value() && + mLGGSSpokeOutputLocalOffsetsRelativeToBlockFlattenedData.has_value(), "LGGS buffer is not built"); // Reshape kernel from [Do, Di, D, H, W] to [WHD, Di, Do]. - auto kernel = weights.permute({4, 3, 2, 1, 0}).contiguous(); - kernel = kernel.reshape({-1, kernel.size(3), kernel.size(4)}); - torch::Tensor out = FVDB_DISPATCH_KERNEL_DEVICE(mLGGSSpokeIndicesFlattenedOffset->device(), [&]() { - return detail::ops::dispatchSparseConvolutionLggs( - input.jdata(), kernel, - mLGGSSpokeIndicesFlattenedOffset.value(), - mLGGSSpokeInputGlobalIndicesFlattenedData.value(), - mLGGSSpokeOutputLocalOffsetsRelativeToBlockFlattenedData.value()); - }); + auto kernel = weights.permute({ 4, 3, 2, 1, 0 }).contiguous(); + kernel = kernel.reshape({ -1, kernel.size(3), kernel.size(4) }); + torch::Tensor out = + FVDB_DISPATCH_KERNEL_DEVICE(mLGGSSpokeIndicesFlattenedOffset->device(), [&]() { + return detail::ops::dispatchSparseConvolutionLggs( + input.jdata(), kernel, mLGGSSpokeIndicesFlattenedOffset.value(), + mLGGSSpokeInputGlobalIndicesFlattenedData.value(), + mLGGSSpokeOutputLocalOffsetsRelativeToBlockFlattenedData.value()); + }); return mTargetGrid.impl()->jaggedTensor(out, false); } else { TORCH_CHECK(false, "Unknown backend"); } - } -JaggedTensor SparseConvPackInfo::sparseTransposeConv3d(const JaggedTensor& input, const torch::Tensor& weights, ConvPackBackend backend) const { - TORCH_CHECK_VALUE(input.num_outer_lists() == mTargetGrid.grid_count(), "Input batch size must match target grid batch size"); - TORCH_CHECK_VALUE(input.element_count() == mTargetGrid.total_voxels(), "Input element count must match target grid total voxels"); +JaggedTensor +SparseConvPackInfo::sparseTransposeConv3d(const JaggedTensor &input, const torch::Tensor &weights, + ConvPackBackend backend) const { + TORCH_CHECK_VALUE(input.num_outer_lists() == mTargetGrid.grid_count(), + "Input batch size must match target grid batch size"); + 
TORCH_CHECK_VALUE(input.element_count() == mTargetGrid.total_voxels(), + "Input element count must match target grid total voxels"); if (backend == ConvPackBackend::GATHER_SCATTER) { auto ret = detail::autograd::SparseConvolutionKernelMap::apply( @@ -293,8 +311,6 @@ JaggedTensor SparseConvPackInfo::sparseTransposeConv3d(const JaggedTensor& input } else { TORCH_CHECK(false, "Unknown backend"); } - } - } // namespace fvdb diff --git a/fvdb/src/SparseConvPackInfo.h b/fvdb/src/SparseConvPackInfo.h index 2dfe0e01e7..3adcdc11e5 100644 --- a/fvdb/src/SparseConvPackInfo.h +++ b/fvdb/src/SparseConvPackInfo.h @@ -1,11 +1,11 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_SPARSECONVPACKINFO_H +#define FVDB_SPARSECONVPACKINFO_H #include "GridBatch.h" - namespace fvdb { enum ConvPackBackend { @@ -16,7 +16,6 @@ enum ConvPackBackend { }; class SparseConvPackInfo : torch::CustomClassHolder { - // #IO: Number of input-output pairs // #O-P: Number of output voxels, padded to multiple of 128 // #I-P: Number of input voxels, padded to multiple of 128 @@ -24,29 +23,40 @@ class SparseConvPackInfo : torch::CustomClassHolder { // S: Split count bool mGSUseME = false; - torch::optional mGSNeighborMap; // [#IO, 2] (int32), GATHER_SCATTER, GATHER_SCATTER(me) - torch::optional mGSNeighborSizes; // [#IO, 2] (int32), GATHER_SCATTER, GATHER_SCATTER(me) - - bool mIGEMMUseTF32 = false; - torch::optional mIGEMMOutInMap; // [#O-P, K] (int32), IGEMM, IGEMM(sorted) - torch::optional mIGEMMReorderLoc; // [S, #O-P] (int32), IGEMM(sorted) - torch::optional mIGEMMSortedMask; // [S, #O-P] (int32), IGEMM(sorted) + torch::optional + mGSNeighborMap; // [#IO, 2] (int32), GATHER_SCATTER, GATHER_SCATTER(me) + torch::optional + mGSNeighborSizes; // [#IO, 2] (int32), GATHER_SCATTER, GATHER_SCATTER(me) + + bool mIGEMMUseTF32 = false; + torch::optional mIGEMMOutInMap; // [#O-P, K] (int32), IGEMM, IGEMM(sorted) + torch::optional mIGEMMReorderLoc; // [S, #O-P] (int32), IGEMM(sorted) + torch::optional mIGEMMSortedMask; // [S, #O-P] (int32), IGEMM(sorted) torch::optional mIGEMMReducedSortedMask; // [S, #O-P//128] (int32), IGEMM(sorted) torch::optional mIGEMMReoderOutInMap; // [#O-P, K] (int32), IGEMM(sorted) - torch::optional mIGEMMOutInMapBwd; // [#I-P, K] (int32), IGEMM, IGEMM(sorted, training) - torch::optional mIGEMMReorderLocBwd; // [S, #I-P] (int32), IGEMM, IGEMM(sorted, training) - torch::optional mIGEMMSortedMaskBwdW; // [S, #I-P//x] (int32), IGEMM, IGEMM(sorted, training) - torch::optional mIGEMMSortedMaskBwdD; // [S, #I-P//y] (int32), IGEMM, IGEMM(sorted, training) - torch::optional mIGEMMReorderOutInMapBwd; // [#I-P, K] (int32), IGEMM, IGEMM(sorted, training) + torch::optional + mIGEMMOutInMapBwd; // [#I-P, K] (int32), IGEMM, IGEMM(sorted, training) + torch::optional + mIGEMMReorderLocBwd; // [S, #I-P] (int32), IGEMM, IGEMM(sorted, training) + torch::optional + mIGEMMSortedMaskBwdW; // [S, #I-P//x] (int32), IGEMM, IGEMM(sorted, training) + torch::optional + mIGEMMSortedMaskBwdD; // [S, #I-P//y] (int32), IGEMM, IGEMM(sorted, training) + torch::optional + mIGEMMReorderOutInMapBwd; // [#I-P, K] (int32), IGEMM, IGEMM(sorted, training) bool mCUTLASSBenchmark = false; - torch::optional mCUTLASSHaloIndexBuffer; // [#active_brick, 6, 4, 4] (int32), CUTLASS - torch::optional mCUTLASSOutputIndexBuffer; // [#active_brick, 4, 2, 2] (int32), CUTLASS + torch::optional + mCUTLASSHaloIndexBuffer; // [#active_brick, 6, 4, 4] (int32), CUTLASS + torch::optional + 
mCUTLASSOutputIndexBuffer; // [#active_brick, 4, 2, 2] (int32), CUTLASS - torch::optional mLGGSSpokeIndicesFlattenedOffset; // 1D array. (int32), LGGS - torch::optional mLGGSSpokeInputGlobalIndicesFlattenedData; // 1D array. (int32), LGGS - torch::optional mLGGSSpokeOutputLocalOffsetsRelativeToBlockFlattenedData; // 1D array. (int32), LGGS + torch::optional mLGGSSpokeIndicesFlattenedOffset; // 1D array. (int32), LGGS + torch::optional + mLGGSSpokeInputGlobalIndicesFlattenedData; // 1D array. (int32), LGGS + torch::optional + mLGGSSpokeOutputLocalOffsetsRelativeToBlockFlattenedData; // 1D array. (int32), LGGS Vec3iOrScalar mStride; Vec3iOrScalar mKernelSize; @@ -54,37 +64,109 @@ class SparseConvPackInfo : torch::CustomClassHolder { GridBatch mSourceGrid; GridBatch mTargetGrid; -public: - const torch::optional neighborMap() const { return mGSNeighborMap; } - const torch::optional neighborSizes() const { return mGSNeighborSizes; } - const bool useME() const { return mGSUseME; } - - const torch::optional outInMap() const { return mIGEMMOutInMap; } - const torch::optional reorderLoc() const { return mIGEMMReorderLoc; } - const torch::optional sortedMask() const { return mIGEMMSortedMask; } - const torch::optional reducedSortedMask() const { return mIGEMMReducedSortedMask; } - const torch::optional reoderOutInMap() const { return mIGEMMReoderOutInMap; } - const bool useTF32() const { return mIGEMMUseTF32; } - - const torch::optional outInMapBwd() const { return mIGEMMOutInMapBwd; } - const torch::optional reorderLocBwd() const { return mIGEMMReorderLocBwd; } - const torch::optional sortedMaskBwdW() const { return mIGEMMSortedMaskBwdW; } - const torch::optional sortedMaskBwdD() const { return mIGEMMSortedMaskBwdD; } - const torch::optional reorderOutInMapBwd() const { return mIGEMMReorderOutInMapBwd; } - - const torch::optional haloIndexBuffer() const { return mCUTLASSHaloIndexBuffer; } - const torch::optional outputIndexBuffer() const { return mCUTLASSOutputIndexBuffer; } - const bool benchmark() const { return mCUTLASSBenchmark; } - - const torch::optional blockKernelRanges() const { return mLGGSSpokeIndicesFlattenedOffset; } - const torch::optional blockKernelInIdx() const { return mLGGSSpokeInputGlobalIndicesFlattenedData; } - const torch::optional blockKernelRelOutIdx() const { return mLGGSSpokeOutputLocalOffsetsRelativeToBlockFlattenedData; } - - const Vec3iOrScalar stride() const { return mStride; } - const Vec3iOrScalar kernelSize() const { return mKernelSize; } - - GridBatch targetGrid() const { return mTargetGrid; } - GridBatch sourceGrid() const { return mSourceGrid; } + public: + const torch::optional + neighborMap() const { + return mGSNeighborMap; + } + const torch::optional + neighborSizes() const { + return mGSNeighborSizes; + } + const bool + useME() const { + return mGSUseME; + } + + const torch::optional + outInMap() const { + return mIGEMMOutInMap; + } + const torch::optional + reorderLoc() const { + return mIGEMMReorderLoc; + } + const torch::optional + sortedMask() const { + return mIGEMMSortedMask; + } + const torch::optional + reducedSortedMask() const { + return mIGEMMReducedSortedMask; + } + const torch::optional + reoderOutInMap() const { + return mIGEMMReoderOutInMap; + } + const bool + useTF32() const { + return mIGEMMUseTF32; + } + + const torch::optional + outInMapBwd() const { + return mIGEMMOutInMapBwd; + } + const torch::optional + reorderLocBwd() const { + return mIGEMMReorderLocBwd; + } + const torch::optional + sortedMaskBwdW() const { + return 
mIGEMMSortedMaskBwdW; + } + const torch::optional + sortedMaskBwdD() const { + return mIGEMMSortedMaskBwdD; + } + const torch::optional + reorderOutInMapBwd() const { + return mIGEMMReorderOutInMapBwd; + } + + const torch::optional + haloIndexBuffer() const { + return mCUTLASSHaloIndexBuffer; + } + const torch::optional + outputIndexBuffer() const { + return mCUTLASSOutputIndexBuffer; + } + const bool + benchmark() const { + return mCUTLASSBenchmark; + } + + const torch::optional + blockKernelRanges() const { + return mLGGSSpokeIndicesFlattenedOffset; + } + const torch::optional + blockKernelInIdx() const { + return mLGGSSpokeInputGlobalIndicesFlattenedData; + } + const torch::optional + blockKernelRelOutIdx() const { + return mLGGSSpokeOutputLocalOffsetsRelativeToBlockFlattenedData; + } + + const Vec3iOrScalar + stride() const { + return mStride; + } + const Vec3iOrScalar + kernelSize() const { + return mKernelSize; + } + + GridBatch + targetGrid() const { + return mTargetGrid; + } + GridBatch + sourceGrid() const { + return mSourceGrid; + } SparseConvPackInfo(Vec3iOrScalar kernelsize, Vec3iOrScalar stride, GridBatch src, torch::optional maybeTarget); @@ -93,13 +175,18 @@ class SparseConvPackInfo : torch::CustomClassHolder { // Will not rebuild if already built void buildGatherScatter(bool use_me = false); - void buildImplicitGEMM(bool sorted, int splitMaskNum, bool training, int splitMaskNumBwd, bool use_tf32 = false); + void buildImplicitGEMM(bool sorted, int splitMaskNum, bool training, int splitMaskNumBwd, + bool use_tf32 = false); void buildCutlass(bool benchmark = false); void buildLGGS(); - JaggedTensor sparseConv3d(const JaggedTensor& input, const torch::Tensor& weights, ConvPackBackend backend = ConvPackBackend::GATHER_SCATTER) const; - JaggedTensor sparseTransposeConv3d(const JaggedTensor& input, const torch::Tensor& weights, ConvPackBackend backend = ConvPackBackend::GATHER_SCATTER) const; + JaggedTensor sparseConv3d(const JaggedTensor &input, const torch::Tensor &weights, + ConvPackBackend backend = ConvPackBackend::GATHER_SCATTER) const; + JaggedTensor + sparseTransposeConv3d(const JaggedTensor &input, const torch::Tensor &weights, + ConvPackBackend backend = ConvPackBackend::GATHER_SCATTER) const; }; +} // namespace fvdb -} +#endif // FVDB_SPARSECONVPACKINFO_H \ No newline at end of file diff --git a/fvdb/src/Types.h b/fvdb/src/Types.h index c407eff022..6c469f3db8 100644 --- a/fvdb/src/Types.h +++ b/fvdb/src/Types.h @@ -1,137 +1,157 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once - -#include -#include -#include +#ifndef FVDB_TYPES_H +#define FVDB_TYPES_H #include "detail/TypesImpl.h" +#include -namespace fvdb { +#include +#include +namespace fvdb { -// These are union types that can be constructed from nanovdb types, torch tensors, std::vectors, single scalars, etc... -// They are used to allow the user to pass in a variety of types to the API, and then convert them to the correct type -using Vec3i = detail::Coord3Impl; +// These are union types that can be constructed from nanovdb types, torch tensors, std::vectors, +// single scalars, etc... 
They are used to allow the user to pass in a variety of types to the API, +// and then convert them to the correct type +using Vec3i = detail::Coord3Impl; using Vec3iOrScalar = detail::Coord3Impl; -using Vec4i = detail::Coord4Impl; -using Vec3d = detail::Vec3dImpl; +using Vec4i = detail::Coord4Impl; +using Vec3d = detail::Vec3dImpl; using Vec3dOrScalar = detail::Vec3dImpl; -// These are union types that can be constructed from nanovdb types, torch tensors, std::vectors, single scalars, etc... -// and resolve to a batch of values. They are used to allow the user to pass in a single vector (or scalar) and have -// it be broadcast to a whole batch of values. -// E.g. if you are constructing a batch of grids, you can pass in a single scalar 1.0 to have a voxel size of [1, 1, 1] -// for every grid in the batch. Or a user can pass in a vector [1, 2, 3] to have each grid have a voxel -// size of [1, 2, 3]. Alternatively, a user can specify a voxel size for each grid in the batch -// [[v1x, v1y, v1z], ..., [vnx, vny, vnz]]. The Vec3dBatchOrScalar will accept all these inputs -// and resolve them to a batch of values. -using Vec3dBatchOrScalar = detail::Vec3BatchImpl; -using Vec3dBatch = detail::Vec3BatchImpl; -using Vec3iBatch = detail::Vec3BatchImpl; - +// These are union types that can be constructed from nanovdb types, torch tensors, std::vectors, +// single scalars, etc... and resolve to a batch of values. They are used to allow the user to pass +// in a single vector (or scalar) and have it be broadcast to a whole batch of values. E.g. if you +// are constructing a batch of grids, you can pass in a single scalar 1.0 to have a voxel size of +// [1, 1, 1] +// for every grid in the batch. Or a user can pass in a vector [1, 2, 3] to have each grid have +// a voxel +// size of [1, 2, 3]. Alternatively, a user can specify a voxel size for each grid in the +// batch +// [[v1x, v1y, v1z], ..., [vnx, vny, vnz]]. The Vec3dBatchOrScalar will accept all these +// inputs and resolve them to a batch of values. +using Vec3dBatchOrScalar = + detail::Vec3BatchImpl; +using Vec3dBatch = + detail::Vec3BatchImpl; +using Vec3iBatch = + detail::Vec3BatchImpl; /// @brief A class that can be constructed from a torch::Device or a string. /// Calling value() returns a torch::device class TorchDeviceOrString { torch::Device mValue; - void setIndex() { - if (mValue.is_cuda() && ! 
mValue.has_index()) {
+    void
+    setIndex() {
+        if (mValue.is_cuda() && !mValue.has_index()) {
             mValue.set_index(c10::cuda::current_device());
         }
     }
-public:
+
+  public:
     TorchDeviceOrString() : mValue(torch::kCPU) { setIndex(); }
     TorchDeviceOrString(torch::Device device) : mValue(device) { setIndex(); }
     TorchDeviceOrString(c10::DeviceType deviceType) : mValue(deviceType) { setIndex(); }
-    TorchDeviceOrString(std::string& str) : mValue(str) { setIndex(); }
+    TorchDeviceOrString(std::string &str) : mValue(str) { setIndex(); }
-    const torch::Device& value() const {
+    const torch::Device &
+    value() const {
         return mValue;
     }
 };
-
-/// @brief A class that con be constructed from a string or a list of strings but always returns a list of strings
-/// Used to enable broadcasting for arguments that specify a single value or a list of values for a whole batch
+/// @brief A class that con be constructed from a string or a list of strings but always returns a
+/// list of strings
+/// Used to enable broadcasting for arguments that specify a single value or a list of values
+/// for a whole batch
 class StringOrListOfStrings {
     std::vector<std::string> mValue;
-public:
+
+  public:
     StringOrListOfStrings() : mValue() {}
-    StringOrListOfStrings(std::string str) : mValue({str}) {}
+    StringOrListOfStrings(std::string str) : mValue({ str }) {}
     StringOrListOfStrings(std::vector<std::string> str) : mValue(str) {}
-    const std::vector<std::string>& value() const {
+    const std::vector<std::string> &
+    value() const {
         return mValue;
     }
 };
-
-/// @brief A class representing a set of unique IDs for a nanovdb grid (used to specify which grids to load
-/// from an .nvdb file). You can specify the set of grids to load as a integer index, a single string name,
-/// a vector of integer indices, or a vector of string names
+/// @brief A class representing a set of unique IDs for a nanovdb grid (used to specify which grids
+/// to load
+/// from an .nvdb file).
You can specify the set of grids to load as a integer index, a +/// single string name, a vector of integer indices, or a vector of string names class NanoVDBFileGridIdentifier { - std::vector mIndices; + std::vector mIndices; std::vector mGridNames; -public: + public: NanoVDBFileGridIdentifier() : mIndices(), mGridNames() {}; - NanoVDBFileGridIdentifier(uint64_t index) : mIndices({index}) {}; + NanoVDBFileGridIdentifier(uint64_t index) : mIndices({ index }) {}; NanoVDBFileGridIdentifier(std::vector indices) : mIndices(indices) {}; - NanoVDBFileGridIdentifier(std::string gridName) : mGridNames({gridName}) {}; + NanoVDBFileGridIdentifier(std::string gridName) : mGridNames({ gridName }) {}; NanoVDBFileGridIdentifier(std::vector gridNames) : mGridNames(gridNames) {}; - std::string toString() const { + std::string + toString() const { std::stringstream ss; if (specifiesIndices()) { - for(auto idx : mIndices) { + for (auto idx: mIndices) { ss << idx << ", "; } return "NanoVDBFileGridIdentifier indices: " + ss.str(); } else { - for(auto idx : mGridNames) { + for (auto idx: mGridNames) { ss << idx << ", "; } return "NanoVDBFileGridIdentifier gridNames: " + ss.str(); } } - bool isValid() const { + bool + isValid() const { return (mIndices.empty() != mGridNames.empty()); } - bool specifiesIndices() const { + bool + specifiesIndices() const { return !mIndices.empty(); } - bool specifiesNames() const { + bool + specifiesNames() const { return !mGridNames.empty(); } - const std::vector& indicesValue() const { + const std::vector & + indicesValue() const { return mIndices; } - const std::vector& namesValue() const { + const std::vector & + namesValue() const { return mGridNames; } - bool empty() const { + bool + empty() const { return (mIndices.empty() && mGridNames.empty()); } - size_t size() const { + size_t + size() const { if (specifiesIndices()) { return mIndices.size(); } else { return mGridNames.size(); } } - }; +} // namespace fvdb -} // namespace fvdb \ No newline at end of file +#endif // FVDB_TYPES_H \ No newline at end of file diff --git a/fvdb/src/detail/GridBatchImpl.cu b/fvdb/src/detail/GridBatchImpl.cu index dd200f6aa2..0159c991a8 100644 --- a/fvdb/src/detail/GridBatchImpl.cu +++ b/fvdb/src/detail/GridBatchImpl.cu @@ -3,34 +3,34 @@ // #include "GridBatchImpl.h" -#include +#include +#include #include +#include #include #include -#include -#include "detail/ops/Ops.h" -#include "detail/build/Build.h" +#include namespace { -__global__ void computeBatchOffsetsFromMetadata( - uint32_t numGrids, - fvdb::detail::GridBatchImpl::GridMetadata* perGridMetadata, - torch::PackedTensorAccessor32 outBatchOffsets) { - +__global__ void +computeBatchOffsetsFromMetadata( + uint32_t numGrids, fvdb::detail::GridBatchImpl::GridMetadata *perGridMetadata, + torch::PackedTensorAccessor32 + outBatchOffsets) { if (numGrids == 0) { return; } outBatchOffsets[0] = 0; for (uint32_t i = 1; i < (numGrids + 1); i += 1) { - outBatchOffsets[i] = outBatchOffsets[i-1] + perGridMetadata[i-1].mNumVoxels; + outBatchOffsets[i] = outBatchOffsets[i - 1] + perGridMetadata[i - 1].mNumVoxels; } } -} +} // namespace namespace fvdb { namespace detail { @@ -39,40 +39,49 @@ GridBatchImpl::GridBatchImpl(torch::Device device, bool isMutable) { std::vector dummy; dummy.push_back(nanovdb::Vec3d(1.0, 1.0, 1.0)); // TODO (Francis): No list-of-lists support for now, so we just pass an empty list of indices - const torch::Tensor lidx = torch::empty({0, 1}, torch::TensorOptions().dtype(fvdb::JLIdxScalarType).device(device)); + const torch::Tensor 
lidx = + torch::empty({ 0, 1 }, torch::TensorOptions().dtype(fvdb::JLIdxScalarType).device(device)); setGrid(build::buildEmptyGrid(device, isMutable), lidx, dummy, dummy, false); mHostGridMetadata.clear(); syncMetadataToDeviceIfCUDA(false); mBatchMetadata.mIsContiguous = true; } -GridBatchImpl::GridBatchImpl(nanovdb::GridHandle&& gridHdl, - const std::vector& voxelSizes, - const std::vector& voxelOrigins) { - TORCH_CHECK(!gridHdl.buffer().isEmpty(), "Cannot create a batched grid handle from an empty grid handle"); +GridBatchImpl::GridBatchImpl(nanovdb::GridHandle &&gridHdl, + const std::vector &voxelSizes, + const std::vector &voxelOrigins) { + TORCH_CHECK(!gridHdl.buffer().isEmpty(), + "Cannot create a batched grid handle from an empty grid handle"); for (std::size_t i = 0; i < voxelSizes.size(); i += 1) { - TORCH_CHECK_VALUE(voxelSizes[i][0] > 0 && voxelSizes[i][1] > 0 && voxelSizes[i][2] > 0, "Voxel size must be greater than 0"); + TORCH_CHECK_VALUE(voxelSizes[i][0] > 0 && voxelSizes[i][1] > 0 && voxelSizes[i][2] > 0, + "Voxel size must be greater than 0"); } mDeviceGridMetadata = nullptr; // TODO (Francis): No list-of-lists support for now, so we just pass an empty list of indices - const torch::Tensor lidx = torch::empty({0, 1}, torch::TensorOptions().dtype(fvdb::JLIdxScalarType).device(gridHdl.buffer().device())); + const torch::Tensor lidx = torch::empty( + { 0, 1 }, + torch::TensorOptions().dtype(fvdb::JLIdxScalarType).device(gridHdl.buffer().device())); setGrid(std::move(gridHdl), lidx, voxelSizes, voxelOrigins, false /* blocking */); mBatchMetadata.mIsContiguous = true; }; -GridBatchImpl::GridBatchImpl(nanovdb::GridHandle&& gridHdl, - const nanovdb::Vec3d& globalVoxelSize, - const nanovdb::Vec3d& globalVoxelOrigin) { - TORCH_CHECK(!gridHdl.buffer().isEmpty(), "Cannot create a batched grid handle from an empty grid handle"); - TORCH_CHECK_VALUE(globalVoxelSize[0] > 0 && globalVoxelSize[1] > 0 && globalVoxelSize[2] > 0, "Voxel size must be greater than 0"); +GridBatchImpl::GridBatchImpl(nanovdb::GridHandle &&gridHdl, + const nanovdb::Vec3d &globalVoxelSize, + const nanovdb::Vec3d &globalVoxelOrigin) { + TORCH_CHECK(!gridHdl.buffer().isEmpty(), + "Cannot create a batched grid handle from an empty grid handle"); + TORCH_CHECK_VALUE(globalVoxelSize[0] > 0 && globalVoxelSize[1] > 0 && globalVoxelSize[2] > 0, + "Voxel size must be greater than 0"); mDeviceGridMetadata = nullptr; std::vector voxelSizes, voxelOrigins; - for(size_t i = 0; i < gridHdl.gridCount(); ++i) { + for (size_t i = 0; i < gridHdl.gridCount(); ++i) { voxelSizes.push_back(globalVoxelSize); voxelOrigins.push_back(globalVoxelOrigin); } // TODO (Francis): No list-of-lists support for now, so we just pass an empty list of indices - const torch::Tensor lidx = torch::empty({0, 1}, torch::TensorOptions().dtype(fvdb::JLIdxScalarType).device(gridHdl.buffer().device())); + const torch::Tensor lidx = torch::empty( + { 0, 1 }, + torch::TensorOptions().dtype(fvdb::JLIdxScalarType).device(gridHdl.buffer().device())); setGrid(std::move(gridHdl), lidx, voxelSizes, voxelOrigins, false /* blocking */); mBatchMetadata.mIsContiguous = true; }; @@ -84,13 +93,15 @@ GridBatchImpl::~GridBatchImpl() { } }; -torch::Tensor GridBatchImpl::worldToGridMatrix(int64_t bi) const { +torch::Tensor +GridBatchImpl::worldToGridMatrix(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); - torch::Tensor xformMat = torch::eye(4, torch::TensorOptions().device(device()).dtype(torch::kDouble)); - const VoxelCoordTransform& transform = 
primalTransform(bi); - const nanovdb::Vec3d& scale = transform.scale(); - const nanovdb::Vec3d& translate = transform.translate(); + torch::Tensor xformMat = + torch::eye(4, torch::TensorOptions().device(device()).dtype(torch::kDouble)); + const VoxelCoordTransform &transform = primalTransform(bi); + const nanovdb::Vec3d &scale = transform.scale(); + const nanovdb::Vec3d &translate = transform.translate(); xformMat[0][0] = scale[0]; xformMat[1][1] = scale[1]; @@ -103,41 +114,49 @@ torch::Tensor GridBatchImpl::worldToGridMatrix(int64_t bi) const { return xformMat; } -void GridBatchImpl::recomputeBatchOffsets() { +void +GridBatchImpl::recomputeBatchOffsets() { TORCH_CHECK(batchSize() == mHostGridMetadata.size(), "Batch size does not match metadata size"); - mBatchOffsets = torch::empty({batchSize() + 1}, torch::TensorOptions().dtype(fvdb::JOffsetsScalarType).device(device())); + mBatchOffsets = + torch::empty({ batchSize() + 1 }, + torch::TensorOptions().dtype(fvdb::JOffsetsScalarType).device(device())); if (device().is_cuda()) { - computeBatchOffsetsFromMetadata<<<1, 1>>>(batchSize(), mDeviceGridMetadata, mBatchOffsets.packed_accessor32()); + computeBatchOffsetsFromMetadata<<<1, 1>>>( + batchSize(), mDeviceGridMetadata, + mBatchOffsets.packed_accessor32()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto outBatchOffsets = mBatchOffsets.accessor(); - outBatchOffsets[0] = 0; + outBatchOffsets[0] = 0; for (int i = 1; i < (mHostGridMetadata.size() + 1); i += 1) { - outBatchOffsets[i] = outBatchOffsets[i-1] + mHostGridMetadata[i-1].mNumVoxels; + outBatchOffsets[i] = outBatchOffsets[i - 1] + mHostGridMetadata[i - 1].mNumVoxels; } } } - -torch::Tensor GridBatchImpl::gridToWorldMatrix(int64_t bi) const { +torch::Tensor +GridBatchImpl::gridToWorldMatrix(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return torch::linalg::inv(worldToGridMatrix(bi)); } -c10::intrusive_ptr GridBatchImpl::clone(torch::Device device, bool blocking) const { - // If you're cloning an empty grid, just create a new empty grid on the right device and return it +c10::intrusive_ptr +GridBatchImpl::clone(torch::Device device, bool blocking) const { + // If you're cloning an empty grid, just create a new empty grid on the right device and return + // it if (batchSize() == 0) { return c10::make_intrusive(device, isMutable()); } - // The guide buffer is a hack to perform the correct copy (i.e. host -> device / device -> host etc...) - // The guide carries the desired target device to the copy. - // The reason we do this is to conform with the nanovdb which can only accept a buffer as an extra argument. + // The guide buffer is a hack to perform the correct copy (i.e. host -> device / device -> host + // etc...) The guide carries the desired target device to the copy. The reason we do this is to + // conform with the nanovdb which can only accept a buffer as an extra argument. 
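// A minimal sketch of the guide-buffer pattern described above, restating only the calls that
// appear just below (TorchDeviceBuffer::setDevice and GridHandle::copy); the GridHandle template
// argument is elided in this extract, so it is omitted here as well:
//
//     TorchDeviceBuffer guide(0, nullptr);     // empty buffer that only carries the target device
//     guide.setDevice(device, true);           // record which device the copy should live on
//     auto clonedHdl = mGridHdl->copy(guide);  // nanovdb allocates the cloned grids via the guide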
TorchDeviceBuffer guideBuffer(0, nullptr); guideBuffer.setDevice(device, true); // Make a copy of this gridHandle on the same device as the guide buffer - nanovdb::GridHandle clonedHdl = mGridHdl->copy(guideBuffer); + nanovdb::GridHandle clonedHdl = + mGridHdl->copy(guideBuffer); // Copy the voxel sizes and origins for this grid std::vector voxelSizes, voxelOrigins; @@ -145,28 +164,31 @@ c10::intrusive_ptr GridBatchImpl::clone(torch::Device device, boo // Build a GridBatchImpl from the cloned grid handle and voxel sizes/origins // FIXME: (@fwilliams) This makes an extra copy or non contiguous grids - return GridBatchImpl::contiguous(c10::make_intrusive(std::move(clonedHdl), voxelSizes, voxelOrigins)); + return GridBatchImpl::contiguous( + c10::make_intrusive(std::move(clonedHdl), voxelSizes, voxelOrigins)); } -void GridBatchImpl::syncMetadataToDeviceIfCUDA(bool blocking) { +void +GridBatchImpl::syncMetadataToDeviceIfCUDA(bool blocking) { if (device().is_cuda()) { // There is something to sync and we're on a cuda device // We haven't allocated the cuda memory yet, so we need to do that now if (mDeviceGridMetadata == nullptr) { // We need to allocate the memory on the device c10::cuda::CUDAGuard deviceGuard(device()); - size_t metaDataByteSize = sizeof(GridMetadata) * mHostGridMetadata.size(); + size_t metaDataByteSize = sizeof(GridMetadata) * mHostGridMetadata.size(); at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(device().index()); - mDeviceGridMetadata = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(metaDataByteSize, defaultStream.stream())); + mDeviceGridMetadata = + static_cast(c10::cuda::CUDACachingAllocator::raw_alloc_with_stream( + metaDataByteSize, defaultStream.stream())); } // Copy host grid metadata to device buffer - size_t metaDataByteSize = sizeof(GridMetadata) * mHostGridMetadata.size(); - at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(mGridHdl->buffer().device().index()); - C10_CUDA_CHECK(cudaMemcpyAsync(mDeviceGridMetadata, - mHostGridMetadata.data(), - metaDataByteSize, - cudaMemcpyHostToDevice, + size_t metaDataByteSize = sizeof(GridMetadata) * mHostGridMetadata.size(); + at::cuda::CUDAStream defaultStream = + at::cuda::getCurrentCUDAStream(mGridHdl->buffer().device().index()); + C10_CUDA_CHECK(cudaMemcpyAsync(mDeviceGridMetadata, mHostGridMetadata.data(), + metaDataByteSize, cudaMemcpyHostToDevice, defaultStream.stream())); // Block if you asked for it if (blocking) { @@ -175,7 +197,8 @@ void GridBatchImpl::syncMetadataToDeviceIfCUDA(bool blocking) { } } -void GridBatchImpl::setGlobalPrimalTransform(const VoxelCoordTransform& transform, bool syncToDevice) { +void +GridBatchImpl::setGlobalPrimalTransform(const VoxelCoordTransform &transform, bool syncToDevice) { for (size_t i = 0; i < mHostGridMetadata.size(); i++) { mHostGridMetadata[i].mPrimalTransform = transform; } @@ -185,7 +208,8 @@ void GridBatchImpl::setGlobalPrimalTransform(const VoxelCoordTransform& transfor } } -void GridBatchImpl::setGlobalDualTransform(const VoxelCoordTransform& transform, bool syncToDevice) { +void +GridBatchImpl::setGlobalDualTransform(const VoxelCoordTransform &transform, bool syncToDevice) { for (size_t i = 0; i < mHostGridMetadata.size(); i++) { mHostGridMetadata[i].mDualTransform = transform; } @@ -195,7 +219,8 @@ void GridBatchImpl::setGlobalDualTransform(const VoxelCoordTransform& transform, } } -void GridBatchImpl::setGlobalVoxelSize(const nanovdb::Vec3d& voxelSize, bool syncToDevice) { +void 
+GridBatchImpl::setGlobalVoxelSize(const nanovdb::Vec3d &voxelSize, bool syncToDevice) { TORCH_CHECK(batchSize() > 0, "Cannot set global voxel size on an empty batch of grids"); for (size_t i = 0; i < mHostGridMetadata.size(); i++) { @@ -207,7 +232,8 @@ void GridBatchImpl::setGlobalVoxelSize(const nanovdb::Vec3d& voxelSize, bool syn } } -void GridBatchImpl::setGlobalVoxelOrigin(const nanovdb::Vec3d& voxelOrigin, bool syncToDevice) { +void +GridBatchImpl::setGlobalVoxelOrigin(const nanovdb::Vec3d &voxelOrigin, bool syncToDevice) { TORCH_CHECK(batchSize() > 0, "Cannot set global voxel origin on an empty batch of grids"); for (size_t i = 0; i < mHostGridMetadata.size(); i++) { @@ -219,8 +245,11 @@ void GridBatchImpl::setGlobalVoxelOrigin(const nanovdb::Vec3d& voxelOrigin, bool } } -void GridBatchImpl::setGlobalVoxelSizeAndOrigin(const nanovdb::Vec3d& voxelSize, const nanovdb::Vec3d& voxelOrigin, bool syncToDevice) { - TORCH_CHECK(batchSize() > 0, "Cannot set global voxel size and origin on an empty batch of grids"); +void +GridBatchImpl::setGlobalVoxelSizeAndOrigin(const nanovdb::Vec3d &voxelSize, + const nanovdb::Vec3d &voxelOrigin, bool syncToDevice) { + TORCH_CHECK(batchSize() > 0, + "Cannot set global voxel size and origin on an empty batch of grids"); for (size_t i = 0; i < mHostGridMetadata.size(); i++) { mHostGridMetadata[i].setTransform(voxelSize, voxelOrigin); @@ -231,9 +260,11 @@ void GridBatchImpl::setGlobalVoxelSizeAndOrigin(const nanovdb::Vec3d& voxelSize, } } - -void GridBatchImpl::setFineTransformFromCoarseGrid(const GridBatchImpl& coarseBatch, nanovdb::Coord subdivisionFactor) { - TORCH_CHECK(coarseBatch.batchSize() == batchSize(), "Coarse grid batch size must match fine grid batch size"); +void +GridBatchImpl::setFineTransformFromCoarseGrid(const GridBatchImpl &coarseBatch, + nanovdb::Coord subdivisionFactor) { + TORCH_CHECK(coarseBatch.batchSize() == batchSize(), + "Coarse grid batch size must match fine grid batch size"); for (size_t i = 0; i < mHostGridMetadata.size(); i++) { auto sizeAndOrigin = coarseBatch.fineVoxSizeAndOrigin(i, subdivisionFactor); @@ -243,9 +274,11 @@ void GridBatchImpl::setFineTransformFromCoarseGrid(const GridBatchImpl& coarseBa syncMetadataToDeviceIfCUDA(false); } - -void GridBatchImpl::setCoarseTransformFromFineGrid(const GridBatchImpl& fineBatch, nanovdb::Coord coarseningFactor) { - TORCH_CHECK(fineBatch.batchSize() == batchSize(), "Fine grid batch size must match coarse grid batch size"); +void +GridBatchImpl::setCoarseTransformFromFineGrid(const GridBatchImpl &fineBatch, + nanovdb::Coord coarseningFactor) { + TORCH_CHECK(fineBatch.batchSize() == batchSize(), + "Fine grid batch size must match coarse grid batch size"); for (size_t i = 0; i < mHostGridMetadata.size(); i++) { auto sizeAndOrigin = fineBatch.coarseVoxSizeAndOrigin(i, coarseningFactor); @@ -255,29 +288,34 @@ void GridBatchImpl::setCoarseTransformFromFineGrid(const GridBatchImpl& fineBatc syncMetadataToDeviceIfCUDA(false); } - -void GridBatchImpl::setPrimalTransformFromDualGrid(const GridBatchImpl& dualBatch) { - TORCH_CHECK(dualBatch.batchSize() == batchSize(), "Dual grid batch size must match primal grid batch size"); +void +GridBatchImpl::setPrimalTransformFromDualGrid(const GridBatchImpl &dualBatch) { + TORCH_CHECK(dualBatch.batchSize() == batchSize(), + "Dual grid batch size must match primal grid batch size"); for (size_t i = 0; i < mHostGridMetadata.size(); i++) { - mHostGridMetadata[i].mDualTransform = dualBatch.mHostGridMetadata[i].mPrimalTransform; + 
mHostGridMetadata[i].mDualTransform = dualBatch.mHostGridMetadata[i].mPrimalTransform; mHostGridMetadata[i].mPrimalTransform = dualBatch.mHostGridMetadata[i].mDualTransform; - mHostGridMetadata[i].mVoxelSize = dualBatch.mHostGridMetadata[i].mVoxelSize; + mHostGridMetadata[i].mVoxelSize = dualBatch.mHostGridMetadata[i].mVoxelSize; } syncMetadataToDeviceIfCUDA(false); } - -void GridBatchImpl::setGrid(nanovdb::GridHandle&& gridHdl, - const torch::Tensor listIndices, - const std::vector& voxelSizes, - const std::vector& voxelOrigins, - bool blocking) { +void +GridBatchImpl::setGrid(nanovdb::GridHandle &&gridHdl, + const torch::Tensor listIndices, + const std::vector &voxelSizes, + const std::vector &voxelOrigins, bool blocking) { TORCH_CHECK(!gridHdl.buffer().isEmpty(), "Empty grid handle"); - TORCH_CHECK(voxelSizes.size() == gridHdl.gridCount(), "voxelSizes array does not have the same size as the number of grids, got ", voxelSizes.size(), " expected ", gridHdl.gridCount()); - TORCH_CHECK(voxelOrigins.size() == gridHdl.gridCount(), "Voxel origins must be the same size as the number of grids"); - TORCH_CHECK((gridHdl.gridType(0) == nanovdb::GridType::OnIndex) || (gridHdl.gridType(0) == nanovdb::GridType::OnIndexMask), "GridBatchImpl only supports ValueOnIndex and ValueOnIndexMask grids"); + TORCH_CHECK(voxelSizes.size() == gridHdl.gridCount(), + "voxelSizes array does not have the same size as the number of grids, got ", + voxelSizes.size(), " expected ", gridHdl.gridCount()); + TORCH_CHECK(voxelOrigins.size() == gridHdl.gridCount(), + "Voxel origins must be the same size as the number of grids"); + TORCH_CHECK((gridHdl.gridType(0) == nanovdb::GridType::OnIndex) || + (gridHdl.gridType(0) == nanovdb::GridType::OnIndexMask), + "GridBatchImpl only supports ValueOnIndex and ValueOnIndexMask grids"); const torch::Device device = gridHdl.buffer().device(); // Clear out old grid metadata @@ -292,31 +330,35 @@ void GridBatchImpl::setGrid(nanovdb::GridHandle&& gridHdl, FVDB_DISPATCH_KERNEL_DEVICE(device, [&]() { // Allocate device memory for metadata - GridBatchMetadata* deviceBatchMetadataPtr = nullptr; + GridBatchMetadata *deviceBatchMetadataPtr = nullptr; if constexpr (DeviceTag == torch::kCUDA) { c10::cuda::CUDAGuard deviceGuard(device); - const size_t metaDataByteSize = sizeof(GridMetadata) * gridHdl.gridCount(); - at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(device.index()); - mDeviceGridMetadata = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(metaDataByteSize, defaultStream.stream())); - deviceBatchMetadataPtr = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(sizeof(GridBatchMetadata), defaultStream.stream())); + const size_t metaDataByteSize = sizeof(GridMetadata) * gridHdl.gridCount(); + at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(device.index()); + mDeviceGridMetadata = + static_cast(c10::cuda::CUDACachingAllocator::raw_alloc_with_stream( + metaDataByteSize, defaultStream.stream())); + deviceBatchMetadataPtr = static_cast( + c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(sizeof(GridBatchMetadata), + defaultStream.stream())); } // Populate host and/or device metadata const bool isGridMutable = gridHdl.gridType(0) == nanovdb::GridType::OnIndexMask; ops::dispatchPopulateGridMetadata( - gridHdl, voxelSizes, voxelOrigins, isGridMutable, - mBatchOffsets, + gridHdl, voxelSizes, voxelOrigins, isGridMutable, mBatchOffsets, mHostGridMetadata.data(), mDeviceGridMetadata, &mBatchMetadata, 
deviceBatchMetadataPtr); - TORCH_CHECK(listIndices.numel() == 0 || listIndices.size(0) == (mBatchOffsets.size(0) - 1), "Invalid list indices when building grid"); + TORCH_CHECK(listIndices.numel() == 0 || listIndices.size(0) == (mBatchOffsets.size(0) - 1), + "Invalid list indices when building grid"); mListIndices = listIndices; - // We don't need the device copy of the global batch metadata anymore (we only carry around the host version and pass it by value to device kernels), so delete it + // We don't need the device copy of the global batch metadata anymore (we only carry around + // the host version and pass it by value to device kernels), so delete it if constexpr (DeviceTag == torch::kCUDA) { c10::cuda::CUDACachingAllocator::raw_delete(deviceBatchMetadataPtr); } }); - // FIXME: This is slow // Populate batch offsets for each leaf node { @@ -324,8 +366,7 @@ void GridBatchImpl::setGrid(nanovdb::GridHandle&& gridHdl, leafBatchIdxs.reserve(gridHdl.gridCount()); for (uint32_t i = 0; i < gridHdl.gridCount(); i += 1) { leafBatchIdxs.push_back( - torch::full({mHostGridMetadata[i].mNumLeaves}, - static_cast(i), + torch::full({ mHostGridMetadata[i].mNumLeaves }, static_cast(i), torch::TensorOptions().dtype(fvdb::JIdxScalarType).device(device))); } mLeafBatchIndices = torch::cat(leafBatchIdxs, 0); @@ -335,39 +376,43 @@ void GridBatchImpl::setGrid(nanovdb::GridHandle&& gridHdl, mGridHdl = std::make_shared>(std::move(gridHdl)); } - -c10::intrusive_ptr GridBatchImpl::index(int64_t bi) const { +c10::intrusive_ptr +GridBatchImpl::index(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); - return index(bi, bi+1, 1); + return index(bi, bi + 1, 1); } - -c10::intrusive_ptr GridBatchImpl::index(const torch::Tensor& indices) const { +c10::intrusive_ptr +GridBatchImpl::index(const torch::Tensor &indices) const { TORCH_CHECK_INDEX(indices.dim() == 1, "indices must be a 1D tensor"); TORCH_CHECK_INDEX(!indices.is_floating_point(), "indices must be an integer tensor"); torch::Tensor numericIndices; - if(indices.scalar_type() == torch::kBool) { + if (indices.scalar_type() == torch::kBool) { TORCH_CHECK_INDEX(indices.dim() == 1, "bool indices must be a 1D tensor"); - TORCH_CHECK_INDEX(indices.numel() == batchSize(), "bool indices must have the same number of entries as grids in the batch"); - numericIndices = torch::arange(batchSize(), torch::TensorOptions().dtype(torch::kInt64).device(indices.device())); + TORCH_CHECK_INDEX( + indices.numel() == batchSize(), + "bool indices must have the same number of entries as grids in the batch"); + numericIndices = torch::arange( + batchSize(), torch::TensorOptions().dtype(torch::kInt64).device(indices.device())); numericIndices = numericIndices.masked_select(indices); } else { numericIndices = indices; } - torch::Tensor indicesCpu = numericIndices.to(torch::kCPU).to(torch::kInt64); - auto indicesAccessor = indicesCpu.accessor(); + torch::Tensor indicesCpu = numericIndices.to(torch::kCPU).to(torch::kInt64); + auto indicesAccessor = indicesCpu.accessor(); return indexInternal(indicesAccessor, indicesAccessor.size(0)); } - -c10::intrusive_ptr GridBatchImpl::index(const std::vector& indices) const { +c10::intrusive_ptr +GridBatchImpl::index(const std::vector &indices) const { return indexInternal(indices, indices.size()); } -c10::intrusive_ptr GridBatchImpl::index(const std::vector& indices) const { +c10::intrusive_ptr +GridBatchImpl::index(const std::vector &indices) const { std::vector indicesInt; indicesInt.reserve(indices.size()); for (size_t i = 0; i < 
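Two of the small libtorch idioms above, written as a standalone sketch (the function names and the int32 index dtype are illustrative): expanding per-grid leaf counts into a flat batch-index tensor, and turning a boolean grid mask into integer indices the way index(const torch::Tensor&) does.

    #include <torch/torch.h>
    #include <vector>

    // Expand per-grid leaf counts into one flat batch-index tensor,
    // e.g. counts {2, 3} -> [0, 0, 1, 1, 1].
    torch::Tensor leafBatchIndices(const std::vector<int64_t> &leafCounts, torch::Device device) {
        std::vector<torch::Tensor> perGrid;
        perGrid.reserve(leafCounts.size());
        for (size_t i = 0; i < leafCounts.size(); ++i) {
            perGrid.push_back(
                torch::full({ leafCounts[i] }, static_cast<int64_t>(i),
                            torch::TensorOptions().dtype(torch::kInt32).device(device)));
        }
        return torch::cat(perGrid, 0);
    }

    // Convert a boolean selection mask over the batch into integer indices via
    // arange + masked_select.
    torch::Tensor maskToIndices(const torch::Tensor &mask, int64_t batchSize) {
        TORCH_CHECK(mask.scalar_type() == torch::kBool && mask.dim() == 1 &&
                        mask.numel() == batchSize,
                    "mask must be a 1D bool tensor with one entry per grid");
        torch::Tensor all = torch::arange(
            batchSize, torch::TensorOptions().dtype(torch::kInt64).device(mask.device()));
        return all.masked_select(mask);
    }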
indices.size(); i += 1) { @@ -379,15 +424,16 @@ c10::intrusive_ptr GridBatchImpl::index(const std::vector& return indexInternal(indicesInt, indicesInt.size()); } - -c10::intrusive_ptr GridBatchImpl::index(ssize_t start, ssize_t stop, ssize_t step) const { +c10::intrusive_ptr +GridBatchImpl::index(ssize_t start, ssize_t stop, ssize_t step) const { struct RangeAccessor { ssize_t mStart; ssize_t mStop; ssize_t mStep; ssize_t mLen; - RangeAccessor(ssize_t start, ssize_t stop, ssize_t step, ssize_t batchSize) : mStart(start), mStop(stop), mStep(step) { + RangeAccessor(ssize_t start, ssize_t stop, ssize_t step, ssize_t batchSize) + : mStart(start), mStop(stop), mStep(step) { TORCH_CHECK_INDEX(step != 0, "slice step cannot be zero"); TORCH_CHECK_INDEX(0 <= start && start <= batchSize, "slice index out of range"); TORCH_CHECK_INDEX(-1 <= stop && stop <= batchSize, "slice index out of range"); @@ -399,10 +445,12 @@ c10::intrusive_ptr GridBatchImpl::index(ssize_t start, ssize_t st } else if (stop <= start && step < 0) { mLen = (mStart - mStop - mStep - 1) / -mStep; } else { - TORCH_CHECK_INDEX(false, "Invalid slice start=", start, ", stop=", stop, ", step=", step, " for batch size ", batchSize); + TORCH_CHECK_INDEX(false, "Invalid slice start=", start, ", stop=", stop, + ", step=", step, " for batch size ", batchSize); } } - size_t operator[](size_t i) const { + size_t + operator[](size_t i) const { return mStart + i * mStep; } }; @@ -411,21 +459,19 @@ c10::intrusive_ptr GridBatchImpl::index(ssize_t start, ssize_t st return indexInternal(acc, acc.mLen); } - -c10::intrusive_ptr GridBatchImpl::concatenate( - const std::vector>& elements) { - +c10::intrusive_ptr +GridBatchImpl::concatenate(const std::vector> &elements) { TORCH_CHECK_VALUE(elements.size() > 0, "Must provide at least one grid for concatenate!") - torch::Device device = elements[0]->device(); - bool isMutable = elements[0]->isMutable(); + torch::Device device = elements[0]->device(); + bool isMutable = elements[0]->isMutable(); std::vector>> handles; - std::vector> byteSizes; - std::vector> readByteOffsets; - std::vector> writeByteOffsets; - int64_t totalByteSize = 0; - int64_t totalGrids = 0; + std::vector> byteSizes; + std::vector> readByteOffsets; + std::vector> writeByteOffsets; + int64_t totalByteSize = 0; + int64_t totalGrids = 0; handles.reserve(elements.size()); byteSizes.reserve(elements.size()); readByteOffsets.reserve(elements.size()); @@ -434,8 +480,10 @@ c10::intrusive_ptr GridBatchImpl::concatenate( std::vector voxelSizes, voxelOrigins; for (size_t i = 0; i < elements.size(); i += 1) { - TORCH_CHECK(elements[i]->device() == device, "All grid batches must be on the same device!"); - TORCH_CHECK(elements[i]->isMutable() == isMutable, "All grid batches must have the same mutability!"); + TORCH_CHECK(elements[i]->device() == device, + "All grid batches must be on the same device!"); + TORCH_CHECK(elements[i]->isMutable() == isMutable, + "All grid batches must have the same mutability!"); // Empty grids don't contribute to the concatenation if (elements[i]->batchSize() == 0) { @@ -457,21 +505,22 @@ c10::intrusive_ptr GridBatchImpl::concatenate( voxelSizes.push_back(elements[i]->voxelSize(j)); voxelOrigins.push_back(elements[i]->voxelOrigin(j)); - readByteOffsets.back().push_back(elements[i]->cumBytes(j)); // Where to start reading from in the current grid - byteSizes.back().push_back(elements[i]->numBytes(j)); // How many bytes to read - writeByteOffsets.back().push_back(totalByteSize); // Where to start writing to in the 
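The slice arithmetic inside RangeAccessor, isolated into a tiny sketch: the number of selected indices for a Python-style start/stop/step triple and the i-th selected index (the real code raises on an invalid combination instead of returning zero).

    #include <cassert>
    #include <cstdint>

    struct Slice {
        int64_t start, stop, step;

        // Number of indices selected by [start:stop:step].
        int64_t length() const {
            assert(step != 0);
            if (start <= stop && step > 0) {
                return (stop - start + step - 1) / step;  // ceil((stop - start) / step)
            } else if (stop <= start && step < 0) {
                return (start - stop - step - 1) / -step; // ceil((start - stop) / -step)
            }
            return 0; // empty; GridBatchImpl::index raises TORCH_CHECK_INDEX here instead
        }

        // i-th selected index.
        int64_t operator[](int64_t i) const { return start + i * step; }
    };

    // Example: Slice{0, 7, 2}.length() == 4, selecting indices 0, 2, 4, 6.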
concatenated grid + readByteOffsets.back().push_back( + elements[i]->cumBytes(j)); // Where to start reading from in the current grid + byteSizes.back().push_back(elements[i]->numBytes(j)); // How many bytes to read + writeByteOffsets.back().push_back( + totalByteSize); // Where to start writing to in the concatenated grid totalByteSize += elements[i]->numBytes(j); } - } if (handles.size() == 0) { return c10::make_intrusive(device, isMutable); } - const bool isHost = device.is_cpu(); + const bool isHost = device.is_cpu(); TorchDeviceBuffer buffer(totalByteSize, nullptr, isHost, device.index()); - int count = 0; + int count = 0; int nonEmptyCount = 0; if (isHost) { for (size_t i = 0; i < elements.size(); i += 1) { @@ -480,50 +529,53 @@ c10::intrusive_ptr GridBatchImpl::concatenate( } for (size_t j = 0; j < elements[i]->batchSize(); j += 1) { - const int64_t readOffset = readByteOffsets[nonEmptyCount][j]; + const int64_t readOffset = readByteOffsets[nonEmptyCount][j]; const int64_t writeOffset = writeByteOffsets[nonEmptyCount][j]; - const int64_t numBytes = byteSizes[nonEmptyCount][j]; + const int64_t numBytes = byteSizes[nonEmptyCount][j]; - nanovdb::GridData* dst = reinterpret_cast(buffer.data() + writeOffset); - const uint8_t* src = elements[i]->mGridHdl->buffer().data() + readOffset; - memcpy((void*) dst, (void*) src, numBytes); + nanovdb::GridData *dst = + reinterpret_cast(buffer.data() + writeOffset); + const uint8_t *src = elements[i]->mGridHdl->buffer().data() + readOffset; + memcpy((void *)dst, (void *)src, numBytes); nanovdb::tools::updateGridCount(dst, count++, totalGrids); } nonEmptyCount += 1; } - } - else { + } else { for (size_t i = 0; i < elements.size(); i += 1) { if (elements[i]->batchSize() == 0) { continue; } for (size_t j = 0; j < elements[i]->batchSize(); j += 1) { - const int64_t readOffset = readByteOffsets[nonEmptyCount][j]; + const int64_t readOffset = readByteOffsets[nonEmptyCount][j]; const int64_t writeOffset = writeByteOffsets[nonEmptyCount][j]; - const int64_t numBytes = byteSizes[nonEmptyCount][j]; + const int64_t numBytes = byteSizes[nonEmptyCount][j]; c10::cuda::CUDAGuard deviceGuard(device.index()); - nanovdb::GridData* dst = reinterpret_cast(buffer.deviceData() + writeOffset); - const uint8_t* src = elements[i]->mGridHdl->buffer().deviceData() + readOffset; - cudaMemcpyAsync((uint8_t*) dst, src, numBytes, cudaMemcpyDeviceToDevice); + nanovdb::GridData *dst = + reinterpret_cast(buffer.deviceData() + writeOffset); + const uint8_t *src = elements[i]->mGridHdl->buffer().deviceData() + readOffset; + cudaMemcpyAsync((uint8_t *)dst, src, numBytes, cudaMemcpyDeviceToDevice); bool dirty, *d_dirty; - cudaMallocAsync((void**)&d_dirty, sizeof(bool), 0); + cudaMallocAsync((void **)&d_dirty, sizeof(bool), 0); nanovdb::cuda::updateGridCount<<<1, 1>>>(dst, count++, totalGrids, d_dirty); C10_CUDA_KERNEL_LAUNCH_CHECK(); cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost); - if (dirty) nanovdb::tools::cuda::updateChecksum(dst, nanovdb::CheckMode::Partial); + if (dirty) + nanovdb::tools::cuda::updateChecksum(dst, nanovdb::CheckMode::Partial); } nonEmptyCount += 1; } } - nanovdb::GridHandle gridHdl = nanovdb::GridHandle(std::move(buffer)); + nanovdb::GridHandle gridHdl = + nanovdb::GridHandle(std::move(buffer)); return c10::make_intrusive(std::move(gridHdl), voxelSizes, voxelOrigins); } - -c10::intrusive_ptr GridBatchImpl::contiguous(c10::intrusive_ptr input) { +c10::intrusive_ptr +GridBatchImpl::contiguous(c10::intrusive_ptr input) { if 
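The CPU concatenation path boils down to a two-pass copy over byte ranges, sketched below with the nanovdb-specific pieces (grid-count fix-up, checksums) stripped out; SrcGrid and concatenateBytes are illustrative names.

    #include <cstdint>
    #include <cstring>
    #include <vector>

    struct SrcGrid {
        const uint8_t *data;     // start of this grid's serialized bytes
        int64_t        numBytes; // size of this grid's serialized bytes
    };

    std::vector<uint8_t> concatenateBytes(const std::vector<SrcGrid> &grids) {
        // First pass: compute the write offset of each grid in the output buffer.
        int64_t              total = 0;
        std::vector<int64_t> writeOffsets;
        writeOffsets.reserve(grids.size());
        for (const SrcGrid &g: grids) {
            writeOffsets.push_back(total);
            total += g.numBytes;
        }
        // Second pass: copy each payload back-to-back.
        std::vector<uint8_t> out(static_cast<size_t>(total));
        for (size_t i = 0; i < grids.size(); i += 1) {
            std::memcpy(out.data() + writeOffsets[i], grids[i].data,
                        static_cast<size_t>(grids[i].numBytes));
        }
        return out;
    }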
(input->isContiguous()) { return input; } @@ -535,10 +587,10 @@ c10::intrusive_ptr GridBatchImpl::contiguous(c10::intrusive_ptrnumBytes(i); } - const bool isHost = input->device().is_cpu(); + const bool isHost = input->device().is_cpu(); TorchDeviceBuffer buffer(totalByteSize, nullptr, isHost, input->device().index()); - int64_t writeOffset = 0; + int64_t writeOffset = 0; std::vector voxelSizes, voxelOrigins; voxelSizes.reserve(input->batchSize()); voxelOrigins.reserve(input->batchSize()); @@ -548,39 +600,42 @@ c10::intrusive_ptr GridBatchImpl::contiguous(c10::intrusive_ptrvoxelSize(i)); voxelOrigins.push_back(input->voxelOrigin(i)); - nanovdb::GridData* dst = reinterpret_cast(buffer.data() + writeOffset); - const uint8_t* src = input->nanoGridHandle().buffer().data() + input->cumBytes(i); - memcpy((void*) dst, (void*) src, input->numBytes(i)); + nanovdb::GridData *dst = + reinterpret_cast(buffer.data() + writeOffset); + const uint8_t *src = input->nanoGridHandle().buffer().data() + input->cumBytes(i); + memcpy((void *)dst, (void *)src, input->numBytes(i)); nanovdb::tools::updateGridCount(dst, i, totalGrids); writeOffset += input->numBytes(i); } - } - else { + } else { for (size_t i = 0; i < input->batchSize(); i += 1) { voxelSizes.push_back(input->voxelSize(i)); voxelOrigins.push_back(input->voxelOrigin(i)); c10::cuda::CUDAGuard deviceGuard(input->device().index()); - nanovdb::GridData* dst = reinterpret_cast(buffer.deviceData() + writeOffset); - const uint8_t* src = input->nanoGridHandle().buffer().deviceData() + input->cumBytes(i); - cudaMemcpyAsync((uint8_t*) dst, src, input->numBytes(i), cudaMemcpyDeviceToDevice); + nanovdb::GridData *dst = + reinterpret_cast(buffer.deviceData() + writeOffset); + const uint8_t *src = input->nanoGridHandle().buffer().deviceData() + input->cumBytes(i); + cudaMemcpyAsync((uint8_t *)dst, src, input->numBytes(i), cudaMemcpyDeviceToDevice); bool dirty, *d_dirty; - cudaMallocAsync((void**)&d_dirty, sizeof(bool), 0); + cudaMallocAsync((void **)&d_dirty, sizeof(bool), 0); nanovdb::cuda::updateGridCount<<<1, 1>>>(dst, i, totalGrids, d_dirty); C10_CUDA_KERNEL_LAUNCH_CHECK(); cudaMemcpyAsync(&dirty, d_dirty, sizeof(bool), cudaMemcpyDeviceToHost); - if (dirty) nanovdb::tools::cuda::updateChecksum(dst, nanovdb::CheckMode::Partial); + if (dirty) + nanovdb::tools::cuda::updateChecksum(dst, nanovdb::CheckMode::Partial); writeOffset += input->numBytes(i); } } - return c10::make_intrusive(nanovdb::GridHandle(std::move(buffer)), voxelSizes, voxelOrigins); + return c10::make_intrusive( + nanovdb::GridHandle(std::move(buffer)), voxelSizes, voxelOrigins); } - -JaggedTensor GridBatchImpl::jaggedTensor(const torch::Tensor& data, bool ignoreDisabledVoxels) const { +JaggedTensor +GridBatchImpl::jaggedTensor(const torch::Tensor &data, bool ignoreDisabledVoxels) const { checkDevice(data); TORCH_CHECK(data.dim() >= 1, "Data have more than one dimensions"); if (ignoreDisabledVoxels || !isMutable()) { @@ -588,45 +643,52 @@ JaggedTensor GridBatchImpl::jaggedTensor(const torch::Tensor& data, bool ignoreD } else { // TODO: (@fwilliams) check data size need to call totalActiveVoxels() } - return JaggedTensor::from_data_offsets_and_list_ids(data, voxelOffsets(ignoreDisabledVoxels), jlidx(ignoreDisabledVoxels)); + return JaggedTensor::from_data_offsets_and_list_ids(data, voxelOffsets(ignoreDisabledVoxels), + jlidx(ignoreDisabledVoxels)); } - -int64_t GridBatchImpl::totalEnabledVoxels(bool ignoreDisabledVoxels) const { +int64_t +GridBatchImpl::totalEnabledVoxels(bool ignoreDisabledVoxels) 
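The jagged layout that jaggedTensor() produces is just a flat data tensor plus a (batchSize + 1)-long offsets tensor, where grid i owns rows [offsets[i], offsets[i+1]). A sketch of reading one element back, assuming 64-bit offsets (the helper name is illustrative):

    #include <torch/torch.h>

    // Slice out the rows of `data` that belong to grid i.
    torch::Tensor jaggedElement(const torch::Tensor &data, const torch::Tensor &offsets, int64_t i) {
        torch::Tensor offsetsCpu = offsets.to(torch::kCPU); // keep the CPU copy alive
        auto          off        = offsetsCpu.accessor<int64_t, 1>();
        return data.slice(/*dim=*/0, /*start=*/off[i], /*end=*/off[i + 1]);
    }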
const { if (!isMutable() || ignoreDisabledVoxels) { return totalVoxels(); } - return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return ops::dispatchCountEnabledVoxels(*this, -1); - }); + return FVDB_DISPATCH_KERNEL_DEVICE( + device(), [&]() { return ops::dispatchCountEnabledVoxels(*this, -1); }); } - -torch::Tensor GridBatchImpl::jidx(bool ignoreDisabledVoxels) const { +torch::Tensor +GridBatchImpl::jidx(bool ignoreDisabledVoxels) const { return FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { if (batchSize() == 1 || totalVoxels() == 0) { - return torch::empty({0}, torch::TensorOptions().dtype(fvdb::JIdxScalarType).device(device())); + return torch::empty( + { 0 }, torch::TensorOptions().dtype(fvdb::JIdxScalarType).device(device())); } return ops::dispatchJIdxForGrid(*this, ignoreDisabledVoxels); }); } -torch::Tensor GridBatchImpl::jlidx(bool ignoreDisabledVoxels) const { +torch::Tensor +GridBatchImpl::jlidx(bool ignoreDisabledVoxels) const { return mListIndices; } -torch::Tensor GridBatchImpl::voxelOffsets(bool ignoreDisabledVoxels) const { +torch::Tensor +GridBatchImpl::voxelOffsets(bool ignoreDisabledVoxels) const { if (!isMutable() || ignoreDisabledVoxels) { return mBatchOffsets; - } else { + } else { // FIXME: This is slow for mutable grids - TORCH_CHECK(isMutable(), "This grid is not mutable, cannot get voxel offsets. This should never happen."); - torch::Tensor numEnabledPerGrid = torch::empty({batchSize() + 1}, torch::TensorOptions().dtype(fvdb::JOffsetsScalarType).device(torch::kCPU)); + TORCH_CHECK( + isMutable(), + "This grid is not mutable, cannot get voxel offsets. This should never happen."); + torch::Tensor numEnabledPerGrid = torch::empty( + { batchSize() + 1 }, + torch::TensorOptions().dtype(fvdb::JOffsetsScalarType).device(torch::kCPU)); auto acc = numEnabledPerGrid.accessor(); - acc[0] = 0; + acc[0] = 0; for (int i = 1; i < (batchSize() + 1); i += 1) { acc[i] = FVDB_DISPATCH_KERNEL_DEVICE(device(), [&]() { - return ops::dispatchCountEnabledVoxels(*this, i-1); + return ops::dispatchCountEnabledVoxels(*this, i - 1); }); } numEnabledPerGrid = numEnabledPerGrid.to(device()); @@ -634,39 +696,43 @@ torch::Tensor GridBatchImpl::voxelOffsets(bool ignoreDisabledVoxels) const { } } -torch::Tensor GridBatchImpl::serialize() const { +torch::Tensor +GridBatchImpl::serialize() const { return serializeV0(); } -c10::intrusive_ptr GridBatchImpl::deserialize(const torch::Tensor& serialized) { +c10::intrusive_ptr +GridBatchImpl::deserialize(const torch::Tensor &serialized) { return deserializeV0(serialized); } - -torch::Tensor GridBatchImpl::serializeV0() const { - c10::intrusive_ptr self = c10::intrusive_ptr::reclaim_copy((GridBatchImpl*) this); +torch::Tensor +GridBatchImpl::serializeV0() const { + c10::intrusive_ptr self = + c10::intrusive_ptr::reclaim_copy((GridBatchImpl *)this); if (!device().is_cpu()) { self = clone(torch::kCPU, true); } - int64_t numGrids = self->nanoGridHandle().gridCount(); + int64_t numGrids = self->nanoGridHandle().gridCount(); int64_t hdlBufSize = self->nanoGridHandle().buffer().size(); struct V01Header { - uint64_t magic = 0x0F0F0F0F0F0F0F0F; + uint64_t magic = 0x0F0F0F0F0F0F0F0F; uint64_t version = 0; uint64_t numGrids; uint64_t totalBytes; } header; - const int64_t headerSize = sizeof(V01Header) + numGrids * sizeof(GridMetadata) + sizeof(GridBatchMetadata); + const int64_t headerSize = + sizeof(V01Header) + numGrids * sizeof(GridMetadata) + sizeof(GridBatchMetadata); const int64_t totalByteSize = headerSize + hdlBufSize; header.totalBytes = totalByteSize; 
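The slow path of voxelOffsets() for mutable grids amounts to counting enabled voxels per grid and turning those counts into exclusive offsets. That counts-to-offsets step on its own, assuming 64-bit values:

    #include <torch/torch.h>
    #include <vector>

    torch::Tensor countsToOffsets(const std::vector<int64_t> &counts) {
        torch::Tensor offsets = torch::zeros({ static_cast<int64_t>(counts.size()) + 1 },
                                             torch::dtype(torch::kInt64));
        auto acc = offsets.accessor<int64_t, 1>();
        for (size_t i = 0; i < counts.size(); i += 1) {
            acc[i + 1] = counts[i]; // store counts shifted by one slot...
        }
        return offsets.cumsum(0);   // ...so the cumulative sum is an exclusive prefix sum
    }

    // Example: countsToOffsets({3, 0, 2}) == [0, 3, 3, 5]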
- header.numGrids = numGrids; + header.numGrids = numGrids; - torch::Tensor ret = torch::empty({totalByteSize}, torch::kInt8); - int8_t* retPtr = ret.data_ptr(); + torch::Tensor ret = torch::empty({ totalByteSize }, torch::kInt8); + int8_t *retPtr = ret.data_ptr(); memcpy(retPtr, &header, sizeof(V01Header)); retPtr += sizeof(V01Header); @@ -680,41 +746,49 @@ torch::Tensor GridBatchImpl::serializeV0() const { memcpy(retPtr, self->nanoGridHandle().buffer().data(), hdlBufSize); retPtr += hdlBufSize; - TORCH_CHECK(retPtr == (ret.data_ptr() + totalByteSize), "Something went wrong with serialization"); + TORCH_CHECK(retPtr == (ret.data_ptr() + totalByteSize), + "Something went wrong with serialization"); return ret; } -c10::intrusive_ptr GridBatchImpl::deserializeV0(const torch::Tensor& serialized) { +c10::intrusive_ptr +GridBatchImpl::deserializeV0(const torch::Tensor &serialized) { struct V01Header { - uint64_t magic = 0x0F0F0F0F0F0F0F0F; + uint64_t magic = 0x0F0F0F0F0F0F0F0F; uint64_t version = 0; uint64_t numGrids; uint64_t totalBytes; }; TORCH_CHECK(serialized.scalar_type() == torch::kInt8, "Serialized data must be of type int8"); - TORCH_CHECK(serialized.numel() >= sizeof(V01Header), "Serialized data is too small to be a valid grid handle"); + TORCH_CHECK(serialized.numel() >= sizeof(V01Header), + "Serialized data is too small to be a valid grid handle"); - const int8_t* serializedPtr = serialized.data_ptr(); + const int8_t *serializedPtr = serialized.data_ptr(); - const V01Header* header = reinterpret_cast(serializedPtr); + const V01Header *header = reinterpret_cast(serializedPtr); TORCH_CHECK(header->magic == 0x0F0F0F0F0F0F0F0F, "Serialized data is not a valid grid handle"); TORCH_CHECK(header->version == 0, "Serialized data is not a valid grid handle"); - TORCH_CHECK(serialized.numel() == header->totalBytes, "Serialized data is not a valid grid handle"); + TORCH_CHECK(serialized.numel() == header->totalBytes, + "Serialized data is not a valid grid handle"); const uint64_t numGrids = header->numGrids; - const GridBatchMetadata* batchMetadata = reinterpret_cast(serializedPtr + sizeof(V01Header)); + const GridBatchMetadata *batchMetadata = + reinterpret_cast(serializedPtr + sizeof(V01Header)); TORCH_CHECK(batchMetadata->version == 1, "Serialized data is not a valid grid handle"); - const GridMetadata* gridMetadata = reinterpret_cast(serializedPtr + sizeof(V01Header) + sizeof(GridBatchMetadata)); + const GridMetadata *gridMetadata = reinterpret_cast( + serializedPtr + sizeof(V01Header) + sizeof(GridBatchMetadata)); for (uint64_t i = 0; i < numGrids; i += 1) { TORCH_CHECK(gridMetadata[i].version == 1, "Serialized data is not a valid grid handle"); } - const int8_t* gridBuffer = serializedPtr + sizeof(V01Header) + sizeof(GridBatchMetadata) + numGrids * sizeof(GridMetadata); + const int8_t *gridBuffer = serializedPtr + sizeof(V01Header) + sizeof(GridBatchMetadata) + + numGrids * sizeof(GridMetadata); - const uint64_t sizeofMetadata = sizeof(V01Header) + sizeof(GridBatchMetadata) + numGrids * sizeof(GridMetadata); + const uint64_t sizeofMetadata = + sizeof(V01Header) + sizeof(GridBatchMetadata) + numGrids * sizeof(GridMetadata); const uint64_t sizeofGrid = header->totalBytes - sizeofMetadata; auto buf = TorchDeviceBuffer(sizeofGrid, nullptr, true /* host */, -1 /* deviceIndex */); diff --git a/fvdb/src/detail/GridBatchImpl.h b/fvdb/src/detail/GridBatchImpl.h index 72a678e58e..dbff710882 100644 --- a/fvdb/src/detail/GridBatchImpl.h +++ b/fvdb/src/detail/GridBatchImpl.h @@ -1,51 +1,55 @@ // 
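The V0 serialization format above is a fixed header (magic, version, grid count, total size) followed by the metadata and grid payload. A reduced sketch of just the header round trip, with the payload layout elided and the helper names illustrative:

    #include <cstdint>
    #include <cstring>
    #include <stdexcept>
    #include <vector>

    struct V0Header {
        uint64_t magic      = 0x0F0F0F0F0F0F0F0Full;
        uint64_t version    = 0;
        uint64_t numGrids   = 0;
        uint64_t totalBytes = 0;
    };

    std::vector<int8_t> serializeHeader(uint64_t numGrids, uint64_t payloadBytes) {
        V0Header header;
        header.numGrids   = numGrids;
        header.totalBytes = sizeof(V0Header) + payloadBytes;
        std::vector<int8_t> out(header.totalBytes);
        std::memcpy(out.data(), &header, sizeof(V0Header)); // metadata + grids would follow
        return out;
    }

    V0Header validateHeader(const std::vector<int8_t> &bytes) {
        if (bytes.size() < sizeof(V0Header))
            throw std::runtime_error("buffer too small to hold a header");
        V0Header header;
        std::memcpy(&header, bytes.data(), sizeof(V0Header));
        if (header.magic != 0x0F0F0F0F0F0F0F0Full)
            throw std::runtime_error("bad magic number");
        if (header.version != 0)
            throw std::runtime_error("unsupported version");
        if (header.totalBytes != bytes.size())
            throw std::runtime_error("size mismatch");
        return header;
    }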
Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once -#include +#ifndef FVDB_DETAIL_GRIDBATCHIMPL_H +#define FVDB_DETAIL_GRIDBATCHIMPL_H -#include +#include "TorchDeviceBuffer.h" +#include "VoxelCoordTransform.h" +#include "utils/Utils.h" + +#include -#include #include +#include -#include "VoxelCoordTransform.h" -#include "JaggedTensor.h" -#include "detail/utils/Utils.h" -#include "detail/TorchDeviceBuffer.h" +#include +#include #if !defined(__CUDACC__) && !defined(__restrict__) #define __restrict__ #endif - namespace fvdb { namespace detail { class GridBatchImpl : public torch::CustomClassHolder { - -public: + public: // Metadata about a single grid in the batch struct GridMetadata { - uint32_t version = 1; // Version of this struct - - int64_t mCumLeaves = 0; // Cumulative number of leaf nodes in the batch up to this grid - int64_t mCumVoxels = 0; // Cumulative number of voxels in the batch up to this grid - uint64_t mCumBytes = 0; // Cumulative number of bytes in the buffer of grids up to this grid - VoxelCoordTransform mPrimalTransform; // Primal Transform of this grid (i.e. transform which aligns origin with voxel center) - VoxelCoordTransform mDualTransform; // Dual Transform of this grid (i.e. transform which aligns origin with voxel corner) - nanovdb::Vec3d mVoxelSize; // Size of a single voxel in world space - uint32_t mNumLeaves; // Number of leaf nodes in this grid - int64_t mNumVoxels; // Number of voxels in this grid - uint64_t mNumBytes; // Number of bytes in the buffer of this grid - nanovdb::CoordBBox mBBox; // Bounding box of this grid - - nanovdb::Vec3d voxelOrigin() const { + uint32_t version = 1; // Version of this struct + + int64_t mCumLeaves = 0; // Cumulative number of leaf nodes in the batch up to this grid + int64_t mCumVoxels = 0; // Cumulative number of voxels in the batch up to this grid + uint64_t mCumBytes = 0; // Cumulative number of bytes in the buffer of grids up to this grid + VoxelCoordTransform mPrimalTransform; // Primal Transform of this grid (i.e. transform which + // aligns origin with voxel center) + VoxelCoordTransform mDualTransform; // Dual Transform of this grid (i.e. transform which + // aligns origin with voxel corner) + nanovdb::Vec3d mVoxelSize; // Size of a single voxel in world space + uint32_t mNumLeaves; // Number of leaf nodes in this grid + int64_t mNumVoxels; // Number of voxels in this grid + uint64_t mNumBytes; // Number of bytes in the buffer of this grid + nanovdb::CoordBBox mBBox; // Bounding box of this grid + + nanovdb::Vec3d + voxelOrigin() const { return mPrimalTransform.applyInv(0., 0., 0.); } - __hostdev__ void setTransform(const nanovdb::Vec3d& voxSize, const nanovdb::Vec3d& voxOrigin) { + __hostdev__ void + setTransform(const nanovdb::Vec3d &voxSize, const nanovdb::Vec3d &voxOrigin) { mVoxelSize = voxSize; voxelTransformForSizeAndOrigin(voxSize, voxOrigin, mPrimalTransform, mDualTransform); } @@ -77,63 +81,72 @@ class GridBatchImpl : public torch::CustomClassHolder { bool mIsContiguous = true; }; - -private: + private: // Metadata for each grid in the batch. There is a seperate host and device version of these. 
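The primal/dual distinction in GridMetadata is a half-voxel shift between a center-aligned and a corner-aligned index lattice. The sketch below shows one common way to express that relationship; the exact convention and API of VoxelCoordTransform are not reproduced here, so treat the sign of the offset as an assumption.

    #include <array>

    struct Transform {
        std::array<double, 3> scale;  // voxel size per axis
        std::array<double, 3> offset; // world-space position of index (0, 0, 0)

        // index -> world
        std::array<double, 3> indexToWorld(double i, double j, double k) const {
            return { i * scale[0] + offset[0], j * scale[1] + offset[1], k * scale[2] + offset[2] };
        }
    };

    // Primal: index (0,0,0) lands on a voxel *center* at `origin`.
    // Dual:   index (0,0,0) lands on a voxel *corner*, half a voxel away
    //         (the sign of the shift is a convention, assumed here).
    inline void makePrimalAndDual(const std::array<double, 3> &voxelSize,
                                  const std::array<double, 3> &origin,
                                  Transform &primal, Transform &dual) {
        primal = { voxelSize, origin };
        dual   = { voxelSize,
                   { origin[0] - 0.5 * voxelSize[0],
                     origin[1] - 0.5 * voxelSize[1],
                     origin[2] - 0.5 * voxelSize[2] } };
    }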
- // The caller of this class sets the host version and is responsible for syncing the device version - // with the host version by calling syncMetadataToDeviceIfCUDA - std::vector mHostGridMetadata; // CPU only - GridMetadata* mDeviceGridMetadata = nullptr; // CUDA only + // The caller of this class sets the host version and is responsible for syncing the device + // version with the host version by calling syncMetadataToDeviceIfCUDA + std::vector mHostGridMetadata; // CPU only + GridMetadata *mDeviceGridMetadata = nullptr; // CUDA only - GridBatchMetadata mBatchMetadata; // Metadata about the whole batch + GridBatchMetadata mBatchMetadata; // Metadata about the whole batch - std::shared_ptr> mGridHdl; // NanoVDB grid handle - torch::Tensor mLeafBatchIndices; // Indices of leaf nodes in the batch shape = [total_leafs] - torch::Tensor mBatchOffsets; // Batch indices for grid (ignores disabled) - torch::Tensor mListIndices; // List indices for grid (same as JaggedTensor, ignores disabled) + std::shared_ptr> mGridHdl; // NanoVDB grid handle + torch::Tensor mLeafBatchIndices; // Indices of leaf nodes in the batch shape = [total_leafs] + torch::Tensor mBatchOffsets; // Batch indices for grid (ignores disabled) + torch::Tensor mListIndices; // List indices for grid (same as JaggedTensor, ignores disabled) // Write back changes to host metadata to the device if we're a cuda handle void syncMetadataToDeviceIfCUDA(bool blocking); - inline std::pair fineVoxSizeAndOrigin(int64_t bi, nanovdb::Coord subdivFactor) const { - TORCH_CHECK(subdivFactor[0] > 0 && subdivFactor[1] > 0 && subdivFactor[2] > 0, "Subdivision factor must be greater than 0"); + inline std::pair + fineVoxSizeAndOrigin(int64_t bi, nanovdb::Coord subdivFactor) const { + TORCH_CHECK(subdivFactor[0] > 0 && subdivFactor[1] > 0 && subdivFactor[2] > 0, + "Subdivision factor must be greater than 0"); const nanovdb::Vec3d w = voxelSize(bi) / subdivFactor.asVec3d(); - const nanovdb::Vec3d tx = voxelOrigin(bi) - (subdivFactor.asVec3d() - nanovdb::Vec3d(1.0)) * w * 0.5; + const nanovdb::Vec3d tx = + voxelOrigin(bi) - (subdivFactor.asVec3d() - nanovdb::Vec3d(1.0)) * w * 0.5; return std::make_pair(w, tx); } - inline std::pair coarseVoxSizeAndOrigin(int64_t bi, nanovdb::Coord branchingFactor) const { - TORCH_CHECK(branchingFactor[0] > 0 && branchingFactor[1] > 0 && branchingFactor[2] > 0, "Coarsening factor must be greater than 0"); + inline std::pair + coarseVoxSizeAndOrigin(int64_t bi, nanovdb::Coord branchingFactor) const { + TORCH_CHECK(branchingFactor[0] > 0 && branchingFactor[1] > 0 && branchingFactor[2] > 0, + "Coarsening factor must be greater than 0"); const nanovdb::Vec3d w = branchingFactor.asVec3d() * voxelSize(bi); - const nanovdb::Vec3d tx = (branchingFactor.asVec3d() - nanovdb::Vec3d(1.0)) * voxelSize(bi) * 0.5 + voxelOrigin(bi); + const nanovdb::Vec3d tx = + (branchingFactor.asVec3d() - nanovdb::Vec3d(1.0)) * voxelSize(bi) * 0.5 + + voxelOrigin(bi); return std::make_pair(w, tx); } - inline int64_t negativeToPositiveIndexWithRangecheck(int64_t bi) const { + inline int64_t + negativeToPositiveIndexWithRangecheck(int64_t bi) const { if (bi < 0) { bi += batchSize(); } - TORCH_CHECK_INDEX(bi >= 0 && bi < batchSize(), "Batch index ", bi, " is out of range for grid batch of size " - + std::to_string(batchSize())); + TORCH_CHECK_INDEX(bi >= 0 && bi < batchSize(), "Batch index ", bi, + " is out of range for grid batch of size " + std::to_string(batchSize())); return static_cast(bi); } void recomputeBatchOffsets(); template - 
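The fine/coarse transforms above reduce to simple per-axis formulas: subdividing by a factor f shrinks the voxel size by f and shifts the origin back by half of the removed extent, and coarsening is the inverse. Written out for one axis:

    #include <utility>

    // Matches fineVoxSizeAndOrigin: w = voxelSize / f, tx = origin - (f - 1) * w / 2.
    std::pair<double, double>
    fineSizeAndOrigin(double voxelSize, double origin, int subdivFactor) {
        const double w  = voxelSize / subdivFactor;
        const double tx = origin - (subdivFactor - 1) * w * 0.5;
        return { w, tx };
    }

    // Matches coarseVoxSizeAndOrigin: w = f * voxelSize, tx = origin + (f - 1) * voxelSize / 2.
    std::pair<double, double>
    coarseSizeAndOrigin(double voxelSize, double origin, int coarseningFactor) {
        const double w  = coarseningFactor * voxelSize;
        const double tx = (coarseningFactor - 1) * voxelSize * 0.5 + origin;
        return { w, tx };
    }

    // Round trip: fineSizeAndOrigin(1.0, 0.0, 2) == {0.5, -0.25} and
    // coarseSizeAndOrigin(0.5, -0.25, 2) == {1.0, 0.0}.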
c10::intrusive_ptr indexInternal(const Indexable& idx, int64_t size) const { + c10::intrusive_ptr + indexInternal(const Indexable &idx, int64_t size) const { if (size == 0) { return c10::make_intrusive(device(), isMutable()); } - TORCH_CHECK(size >= 0, "Indexing with negative size is not supported (this should never happen)"); + TORCH_CHECK(size >= 0, + "Indexing with negative size is not supported (this should never happen)"); c10::intrusive_ptr ret = c10::make_intrusive(); - ret->mGridHdl = mGridHdl; + ret->mGridHdl = mGridHdl; - int64_t cumVoxels = 0; - int64_t cumLeaves = 0; - int64_t maxVoxels = 0; - uint32_t maxLeafCount = 0; - int64_t count = 0; + int64_t cumVoxels = 0; + int64_t cumLeaves = 0; + int64_t maxVoxels = 0; + uint32_t maxLeafCount = 0; + int64_t count = 0; nanovdb::CoordBBox totalBbox; std::vector leafBatchIdxs; @@ -142,18 +155,19 @@ class GridBatchImpl : public torch::CustomClassHolder { bool isContiguous = mBatchMetadata.mIsContiguous; for (size_t i = 0; i < size; i += 1) { int64_t bi = idx[i]; - bi = negativeToPositiveIndexWithRangecheck(bi); + bi = negativeToPositiveIndexWithRangecheck(bi); - // If indices are not contiguous or the grid we're viewing is not contiguous, then we're no longer contiguous + // If indices are not contiguous or the grid we're viewing is not contiguous, then we're + // no longer contiguous isContiguous = isContiguous && (bi == count); - const uint32_t numLeaves = mHostGridMetadata[bi].mNumLeaves; - const int64_t numVoxels = mHostGridMetadata[bi].mNumVoxels; - const nanovdb::CoordBBox& bbox = mHostGridMetadata[bi].mBBox; + const uint32_t numLeaves = mHostGridMetadata[bi].mNumLeaves; + const int64_t numVoxels = mHostGridMetadata[bi].mNumVoxels; + const nanovdb::CoordBBox &bbox = mHostGridMetadata[bi].mBBox; ret->mHostGridMetadata.push_back(mHostGridMetadata[bi]); ret->mHostGridMetadata[count].mCumLeaves = cumLeaves; - ret->mHostGridMetadata[count].mCumVoxels= cumVoxels; + ret->mHostGridMetadata[count].mCumVoxels = cumVoxels; if (count == 0) { totalBbox = bbox; @@ -162,20 +176,23 @@ class GridBatchImpl : public torch::CustomClassHolder { } cumLeaves += numLeaves; cumVoxels += numVoxels; - maxVoxels = std::max(maxVoxels, numVoxels); + maxVoxels = std::max(maxVoxels, numVoxels); maxLeafCount = std::max(maxLeafCount, numLeaves); - leafBatchIdxs.push_back(torch::full({numLeaves}, torch::Scalar(count), torch::TensorOptions().dtype(fvdb::JIdxScalarType).device(device()))); + leafBatchIdxs.push_back( + torch::full({ numLeaves }, torch::Scalar(count), + torch::TensorOptions().dtype(fvdb::JIdxScalarType).device(device()))); count += 1; } - // If all the indices were contiguous and the grid we're viewing is contiguous, then we're contiguous + // If all the indices were contiguous and the grid we're viewing is contiguous, then we're + // contiguous ret->mBatchMetadata.mIsContiguous = isContiguous && (count == batchSize()); - ret->mBatchMetadata.mTotalLeaves = cumLeaves; - ret->mBatchMetadata.mTotalVoxels = cumVoxels; - ret->mBatchMetadata.mMaxVoxels = maxVoxels; + ret->mBatchMetadata.mTotalLeaves = cumLeaves; + ret->mBatchMetadata.mTotalVoxels = cumVoxels; + ret->mBatchMetadata.mMaxVoxels = maxVoxels; ret->mBatchMetadata.mMaxLeafCount = maxLeafCount; - ret->mBatchMetadata.mTotalBBox = totalBbox; - ret->mBatchMetadata.mIsMutable = isMutable(); + ret->mBatchMetadata.mTotalBBox = totalBbox; + ret->mBatchMetadata.mIsMutable = isMutable(); if (leafBatchIdxs.size() > 0) { ret->mLeafBatchIndices = torch::cat(leafBatchIdxs, 0); @@ -192,123 +209,137 @@ class 
GridBatchImpl : public torch::CustomClassHolder { return ret; } -public: - template - class Accessor { + public: + template class Accessor { friend class GridBatchImpl; - const GridBatchImpl::GridMetadata* __restrict__ mMetadata = nullptr; // 8 bytes - const nanovdb::NanoGrid* __restrict__ mGridPtr = nullptr; // 8 bytes - fvdb::JIdxType* __restrict__ mLeafBatchIndices = nullptr; // 8 bytes - int64_t mTotalVoxels = 0; // 8 bytes - int64_t mTotalLeaves = 0; // 8 bytes - int64_t mMaxVoxels = 0; // 8 bytes - uint32_t mMaxLeafCount = 0; // 4 bytes - int64_t mGridCount = 0; // 8 bytes - - private: - __hostdev__ inline int64_t negativeToPositiveIndexWithRangecheck(int64_t bi) const { - if (bi < 0) { - bi += batchSize(); + const GridBatchImpl::GridMetadata *__restrict__ mMetadata = nullptr; // 8 bytes + const nanovdb::NanoGrid *__restrict__ mGridPtr = nullptr; // 8 bytes + fvdb::JIdxType *__restrict__ mLeafBatchIndices = nullptr; // 8 bytes + int64_t mTotalVoxels = 0; // 8 bytes + int64_t mTotalLeaves = 0; // 8 bytes + int64_t mMaxVoxels = 0; // 8 bytes + uint32_t mMaxLeafCount = 0; // 4 bytes + int64_t mGridCount = 0; // 8 bytes + + private: + __hostdev__ inline int64_t + negativeToPositiveIndexWithRangecheck(int64_t bi) const { + if (bi < 0) { + bi += batchSize(); + } + assert(bi >= 0 && bi < batchSize()); + return static_cast(bi); } - assert(bi >= 0 && bi < batchSize()); - return static_cast(bi); - } - public: - - __hostdev__ const nanovdb::NanoGrid* grid(int64_t bi) const { + public: + __hostdev__ const nanovdb::NanoGrid * + grid(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); - return reinterpret_cast*>( - reinterpret_cast(mGridPtr) + mMetadata[bi].mCumBytes); + return reinterpret_cast *>( + reinterpret_cast(mGridPtr) + mMetadata[bi].mCumBytes); } - __hostdev__ nanovdb::CoordBBox bbox(int64_t bi) const { + __hostdev__ nanovdb::CoordBBox + bbox(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return grid(bi)->tree().bbox(); } - __hostdev__ nanovdb::CoordBBox dualBbox(int64_t bi) const { + __hostdev__ nanovdb::CoordBBox + dualBbox(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); nanovdb::CoordBBox dualBbox(bbox(bi)); dualBbox.mCoord[1] += nanovdb::Coord(1, 1, 1); return dualBbox; } - __hostdev__ int64_t batchSize() const { + __hostdev__ int64_t + batchSize() const { return mGridCount; } - __hostdev__ int64_t voxelOffset(int64_t bi) const { + __hostdev__ int64_t + voxelOffset(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mMetadata[bi].mCumVoxels; } - __hostdev__ int64_t leafOffset(int64_t bi) const { + __hostdev__ int64_t + leafOffset(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mMetadata[bi].mCumLeaves; } - __hostdev__ int64_t maxVoxels() const { + __hostdev__ int64_t + maxVoxels() const { return mMaxVoxels; } - __hostdev__ uint32_t maxLeafCount() const { + __hostdev__ uint32_t + maxLeafCount() const { return mMaxLeafCount; } - __hostdev__ int64_t totalVoxels() const { + __hostdev__ int64_t + totalVoxels() const { return mTotalVoxels; } - __hostdev__ int64_t totalLeaves() const { + __hostdev__ int64_t + totalLeaves() const { return mTotalLeaves; } - __hostdev__ const VoxelCoordTransform& primalTransform(int64_t bi) const { + __hostdev__ const VoxelCoordTransform & + primalTransform(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mMetadata[bi].mPrimalTransform; } - __hostdev__ const VoxelCoordTransform& dualTransform(int64_t bi) const { + __hostdev__ 
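Two accessor idioms worth calling out in isolation: grids in a batch are addressed by a cumulative byte offset from the first grid, and the dual bounding box is the primal one grown by one voxel at its max corner. GridData below is an opaque placeholder, not the real nanovdb type.

    #include <cstdint>

    struct GridData; // opaque stand-in for the serialized grid header

    // Address grid bi inside a packed batch by its cumulative byte offset.
    inline const GridData *
    gridAt(const GridData *firstGrid, const uint64_t *cumBytes, int64_t bi) {
        return reinterpret_cast<const GridData *>(
            reinterpret_cast<const char *>(firstGrid) + cumBytes[bi]);
    }

    struct CoordBBox {
        int32_t min[3];
        int32_t max[3];
    };

    // The dual (corner) lattice has one extra voxel along each axis.
    inline CoordBBox
    dualBBox(CoordBBox b) {
        b.max[0] += 1;
        b.max[1] += 1;
        b.max[2] += 1;
        return b;
    }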
const VoxelCoordTransform & + dualTransform(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mMetadata[bi].mDualTransform; } - __hostdev__ fvdb::JIdxType leafBatchIndex(int64_t leaf_idx) const { + __hostdev__ fvdb::JIdxType + leafBatchIndex(int64_t leaf_idx) const { return mLeafBatchIndices[leaf_idx]; } }; template - Accessor hostAccessor() const { + Accessor + hostAccessor() const { TORCH_CHECK(!isEmpty(), "Cannot access empty grid"); Accessor ret; ret.mMetadata = mHostGridMetadata.data(); - ret.mGridPtr = mGridHdl->template grid(); + ret.mGridPtr = mGridHdl->template grid(); TORCH_CHECK(ret.mGridPtr != nullptr, "Failed to get host grid pointer"); - ret.mTotalVoxels = mBatchMetadata.mTotalVoxels; - ret.mTotalLeaves = mBatchMetadata.mTotalLeaves; - ret.mMaxVoxels = mBatchMetadata.mMaxVoxels; - ret.mMaxLeafCount = mBatchMetadata.mMaxLeafCount; - ret.mGridCount = static_cast(mHostGridMetadata.size()); + ret.mTotalVoxels = mBatchMetadata.mTotalVoxels; + ret.mTotalLeaves = mBatchMetadata.mTotalLeaves; + ret.mMaxVoxels = mBatchMetadata.mMaxVoxels; + ret.mMaxLeafCount = mBatchMetadata.mMaxLeafCount; + ret.mGridCount = static_cast(mHostGridMetadata.size()); ret.mLeafBatchIndices = mLeafBatchIndices.data_ptr(); return ret; } template - Accessor deviceAccessor() const { + Accessor + deviceAccessor() const { TORCH_CHECK(!isEmpty(), "Cannot access empty grid"); TORCH_CHECK(device().is_cuda(), "Cannot access device accessor on non-CUDA device"); Accessor ret; ret.mMetadata = mDeviceGridMetadata; - ret.mGridPtr = mGridHdl->template deviceGrid(); + ret.mGridPtr = mGridHdl->template deviceGrid(); TORCH_CHECK(ret.mGridPtr != nullptr, "Failed to get device grid pointer"); - ret.mTotalVoxels = mBatchMetadata.mTotalVoxels; - ret.mTotalLeaves = mBatchMetadata.mTotalLeaves; - ret.mMaxVoxels = mBatchMetadata.mMaxVoxels; - ret.mMaxLeafCount = mBatchMetadata.mMaxLeafCount; - ret.mGridCount = static_cast(mHostGridMetadata.size()); + ret.mTotalVoxels = mBatchMetadata.mTotalVoxels; + ret.mTotalLeaves = mBatchMetadata.mTotalLeaves; + ret.mMaxVoxels = mBatchMetadata.mMaxVoxels; + ret.mMaxLeafCount = mBatchMetadata.mMaxLeafCount; + ret.mGridCount = static_cast(mHostGridMetadata.size()); ret.mLeafBatchIndices = mLeafBatchIndices.data_ptr(); return ret; @@ -318,22 +349,21 @@ class GridBatchImpl : public torch::CustomClassHolder { GridBatchImpl(torch::Device device, bool isMutable); - GridBatchImpl(nanovdb::GridHandle&& gridHdl, - const std::vector& voxelSizes, - const std::vector& voxelOrigins); + GridBatchImpl(nanovdb::GridHandle &&gridHdl, + const std::vector &voxelSizes, + const std::vector &voxelOrigins); - GridBatchImpl(nanovdb::GridHandle&& gridHdl, - const nanovdb::Vec3d& globalVoxelSize, - const nanovdb::Vec3d& globalVoxelOrigin); + GridBatchImpl(nanovdb::GridHandle &&gridHdl, + const nanovdb::Vec3d &globalVoxelSize, const nanovdb::Vec3d &globalVoxelOrigin); ~GridBatchImpl(); // Cannot move make copies of this handle. There is only one owner of the underlying buffer. 
// This class should only be created and copied through c10::intrusive_ptr - GridBatchImpl& operator=(GridBatchImpl&& other) = delete; - GridBatchImpl(GridBatchImpl&& other) = delete; - GridBatchImpl(GridBatchImpl& other) = delete; - GridBatchImpl& operator=(GridBatchImpl& other) = delete; + GridBatchImpl &operator=(GridBatchImpl &&other) = delete; + GridBatchImpl(GridBatchImpl &&other) = delete; + GridBatchImpl(GridBatchImpl &other) = delete; + GridBatchImpl &operator=(GridBatchImpl &other) = delete; torch::Tensor voxelOffsets(bool ignoreDisabledVoxels) const; @@ -341,89 +371,107 @@ class GridBatchImpl : public torch::CustomClassHolder { torch::Tensor jidx(bool ignoreDisabledVoxels) const; - int64_t totalLeaves() const { + int64_t + totalLeaves() const { return mBatchMetadata.mTotalLeaves; } - int64_t totalVoxels() const { + int64_t + totalVoxels() const { return mBatchMetadata.mTotalVoxels; } int64_t totalEnabledVoxels(bool ignoreDisabledVoxels) const; - int64_t maxVoxelsPerGrid() const { + int64_t + maxVoxelsPerGrid() const { return mBatchMetadata.mMaxVoxels; } - int64_t maxLeavesPerGrid() const { + int64_t + maxLeavesPerGrid() const { return static_cast(mBatchMetadata.mMaxLeafCount); } - int64_t batchSize() const { + int64_t + batchSize() const { return static_cast(mHostGridMetadata.size()); } - uint64_t totalBytes() const { + uint64_t + totalBytes() const { uint64_t sum = 0; - for (const auto& grid : mHostGridMetadata) { + for (const auto &grid: mHostGridMetadata) { sum += grid.mNumBytes; } return sum; } - const nanovdb::GridHandle& nanoGridHandle() const { + const nanovdb::GridHandle & + nanoGridHandle() const { return *mGridHdl; } - bool isMutable() const { + bool + isMutable() const { return mBatchMetadata.mIsMutable; } - const c10::Device device() const { + const c10::Device + device() const { return mGridHdl->buffer().device(); } - bool isEmpty() const { + bool + isEmpty() const { return mGridHdl->buffer().isEmpty(); } - uint32_t numLeaves(int64_t bi) const { + uint32_t + numLeaves(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mHostGridMetadata[bi].mNumLeaves; } - int64_t numVoxels(int64_t bi) const { + int64_t + numVoxels(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mHostGridMetadata[bi].mNumVoxels; } - int64_t cumVoxels(int64_t bi) const { + int64_t + cumVoxels(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mHostGridMetadata[bi].mCumVoxels; } - uint64_t numBytes(int64_t bi) const { + uint64_t + numBytes(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mHostGridMetadata[bi].mNumBytes; } - uint64_t cumBytes(int64_t bi) const { + uint64_t + cumBytes(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mHostGridMetadata[bi].mCumBytes; } - const VoxelCoordTransform& primalTransform(int64_t bi) const { + const VoxelCoordTransform & + primalTransform(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mHostGridMetadata[bi].mPrimalTransform; } - const VoxelCoordTransform& dualTransform(int64_t bi) const { + const VoxelCoordTransform & + dualTransform(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mHostGridMetadata[bi].mDualTransform; } - void gridVoxelSizesAndOrigins(std::vector& outVoxelSizes, - std::vector& outVoxelOrigins) const { + void + gridVoxelSizesAndOrigins(std::vector &outVoxelSizes, + std::vector &outVoxelOrigins) const { outVoxelSizes.clear(); outVoxelOrigins.clear(); for (int64_t i = 0; i 
< batchSize(); ++i) { @@ -432,29 +480,34 @@ class GridBatchImpl : public torch::CustomClassHolder { } } - const nanovdb::CoordBBox& totalBBox() const { + const nanovdb::CoordBBox & + totalBBox() const { return mBatchMetadata.mTotalBBox; } - const nanovdb::CoordBBox& bbox(int64_t bi) const { + const nanovdb::CoordBBox & + bbox(int64_t bi) const { checkNonEmptyGrid(); bi = negativeToPositiveIndexWithRangecheck(bi); return mHostGridMetadata[bi].mBBox; } - const nanovdb::CoordBBox dualBbox(int64_t bi) const { - bi = negativeToPositiveIndexWithRangecheck(bi); + const nanovdb::CoordBBox + dualBbox(int64_t bi) const { + bi = negativeToPositiveIndexWithRangecheck(bi); nanovdb::CoordBBox dualBbox = bbox(bi); dualBbox.mCoord[1] += nanovdb::Coord(1, 1, 1); return dualBbox; } - const nanovdb::Vec3d& voxelSize(int64_t bi) const { + const nanovdb::Vec3d & + voxelSize(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mHostGridMetadata[bi].mVoxelSize; } - const nanovdb::Vec3d voxelOrigin(int64_t bi) const { + const nanovdb::Vec3d + voxelOrigin(int64_t bi) const { bi = negativeToPositiveIndexWithRangecheck(bi); return mHostGridMetadata[bi].voxelOrigin(); } @@ -465,51 +518,59 @@ class GridBatchImpl : public torch::CustomClassHolder { c10::intrusive_ptr clone(torch::Device device, bool blocking = false) const; - void checkNonEmptyGrid() const { + void + checkNonEmptyGrid() const { TORCH_CHECK(!isEmpty(), "Empty grid"); } - void checkDevice(const torch::Tensor& t) const { + void + checkDevice(const torch::Tensor &t) const { torch::Device hdlDevice = mGridHdl->buffer().device(); - TORCH_CHECK(hdlDevice == t.device(), "All tensors must be on the same device (" + hdlDevice.str() + - ") as index grid but got " + t.device().str()); + TORCH_CHECK(hdlDevice == t.device(), "All tensors must be on the same device (" + + hdlDevice.str() + ") as index grid but got " + + t.device().str()); } - void checkDevice(const JaggedTensor& t) const { + void + checkDevice(const JaggedTensor &t) const { torch::Device hdlDevice = mGridHdl->buffer().device(); - TORCH_CHECK(hdlDevice == t.device(), "All tensors must be on the same device (" + hdlDevice.str() + - ") as index grid but got " + t.device().str()); + TORCH_CHECK(hdlDevice == t.device(), "All tensors must be on the same device (" + + hdlDevice.str() + ") as index grid but got " + + t.device().str()); } - JaggedTensor jaggedTensor(const torch::Tensor& data, bool ignoreDisabledVoxels) const; + JaggedTensor jaggedTensor(const torch::Tensor &data, bool ignoreDisabledVoxels) const; - void setGlobalPrimalTransform(const VoxelCoordTransform& transform, bool syncToDevice = true); - void setGlobalDualTransform(const VoxelCoordTransform& transform, bool syncToDevice = true); - void setGlobalVoxelSize(const nanovdb::Vec3d& voxelSize, bool syncToDevice = true); - void setGlobalVoxelOrigin(const nanovdb::Vec3d& voxelOrigin, bool syncToDevice = true); - void setGlobalVoxelSizeAndOrigin(const nanovdb::Vec3d& voxelSize, const nanovdb::Vec3d& voxelOrigin, bool syncToDevice = true); + void setGlobalPrimalTransform(const VoxelCoordTransform &transform, bool syncToDevice = true); + void setGlobalDualTransform(const VoxelCoordTransform &transform, bool syncToDevice = true); + void setGlobalVoxelSize(const nanovdb::Vec3d &voxelSize, bool syncToDevice = true); + void setGlobalVoxelOrigin(const nanovdb::Vec3d &voxelOrigin, bool syncToDevice = true); + void setGlobalVoxelSizeAndOrigin(const nanovdb::Vec3d &voxelSize, + const nanovdb::Vec3d &voxelOrigin, bool syncToDevice = 
true); - void setFineTransformFromCoarseGrid(const GridBatchImpl& coarseBatch, nanovdb::Coord subdivisionFactor); - void setCoarseTransformFromFineGrid(const GridBatchImpl& fineBatch, nanovdb::Coord coarseningFactor); - void setPrimalTransformFromDualGrid(const GridBatchImpl& dualBatch); + void setFineTransformFromCoarseGrid(const GridBatchImpl &coarseBatch, + nanovdb::Coord subdivisionFactor); + void setCoarseTransformFromFineGrid(const GridBatchImpl &fineBatch, + nanovdb::Coord coarseningFactor); + void setPrimalTransformFromDualGrid(const GridBatchImpl &dualBatch); - void setGrid(nanovdb::GridHandle&& gridHdl, - const torch::Tensor listIndices, - const std::vector& voxelSizes, - const std::vector& voxelOrigins, - bool blocking = false); + void setGrid(nanovdb::GridHandle &&gridHdl, const torch::Tensor listIndices, + const std::vector &voxelSizes, + const std::vector &voxelOrigins, bool blocking = false); c10::intrusive_ptr index(int64_t bi) const; c10::intrusive_ptr index(ssize_t start, ssize_t stop, ssize_t step) const; - c10::intrusive_ptr index(const torch::Tensor& indices) const; - c10::intrusive_ptr index(const std::vector& indices) const; - c10::intrusive_ptr index(const std::vector& indices) const; + c10::intrusive_ptr index(const torch::Tensor &indices) const; + c10::intrusive_ptr index(const std::vector &indices) const; + c10::intrusive_ptr index(const std::vector &indices) const; - static c10::intrusive_ptr concatenate(const std::vector>& elements); + static c10::intrusive_ptr + concatenate(const std::vector> &elements); static c10::intrusive_ptr contiguous(c10::intrusive_ptr input); - bool isContiguous() const { + bool + isContiguous() const { return mBatchMetadata.mIsContiguous; } @@ -517,18 +578,17 @@ class GridBatchImpl : public torch::CustomClassHolder { torch::Tensor serialize() const; // Load a CPU int8 tensor into a grid batch handle - static c10::intrusive_ptr deserialize(const torch::Tensor& serialized); + static c10::intrusive_ptr deserialize(const torch::Tensor &serialized); -private: + private: // We're going to version serialization. 
These are v0 - torch::Tensor serializeV0() const; - static c10::intrusive_ptr deserializeV0(const torch::Tensor& serialized); - + torch::Tensor serializeV0() const; + static c10::intrusive_ptr deserializeV0(const torch::Tensor &serialized); }; -template -using BatchGridAccessor = typename GridBatchImpl::Accessor; +template using BatchGridAccessor = typename GridBatchImpl::Accessor; +} // namespace detail +} // namespace fvdb -} // namespace detail -} // namespace fvdb +#endif // FVDB_DETAIL_GRIDBATCHIMPL_H \ No newline at end of file diff --git a/fvdb/src/detail/TorchDeviceBuffer.cpp b/fvdb/src/detail/TorchDeviceBuffer.cpp index 0293050cbb..f498a66802 100644 --- a/fvdb/src/detail/TorchDeviceBuffer.cpp +++ b/fvdb/src/detail/TorchDeviceBuffer.cpp @@ -6,9 +6,9 @@ #include #include -#include // for cudaMalloc/cudaMallocManaged/cudaFree #include #include +#include // for cudaMalloc/cudaMallocManaged/cudaFree namespace nanovdb { @@ -17,89 +17,98 @@ namespace nanovdb { // TODO: Pass in synchronous option template <> template <> -GridHandle GridHandle::copy(const fvdb::detail::TorchDeviceBuffer& guide) const { +GridHandle +GridHandle::copy( + const fvdb::detail::TorchDeviceBuffer &guide) const { if (mBuffer.isEmpty()) { fvdb::detail::TorchDeviceBuffer retbuf(0, nullptr); retbuf.setDevice(guide.device(), false); - return GridHandle(std::move(retbuf));// return an empty handle + return GridHandle( + std::move(retbuf)); // return an empty handle } - const bool guideIsHost = guide.device().is_cpu(); - const bool iAmHost = mBuffer.device().is_cpu(); + const bool guideIsHost = guide.device().is_cpu(); + const bool iAmHost = mBuffer.device().is_cpu(); const bool guideIsDevice = !guideIsHost; - const bool iAmDevice = !iAmHost; + const bool iAmDevice = !iAmHost; auto buffer = fvdb::detail::TorchDeviceBuffer::create(mBuffer.size(), &guide, guideIsHost); if (iAmHost && guideIsHost) { - std::memcpy(buffer.data(), mBuffer.data(), mBuffer.size()); // deep copy of buffer in CPU RAM + std::memcpy(buffer.data(), mBuffer.data(), + mBuffer.size()); // deep copy of buffer in CPU RAM } else if (iAmHost && guideIsDevice) { at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(guide.device().index()); - cudaCheck(cudaMemcpyAsync(buffer.deviceData(), mBuffer.data(), mBuffer.size(), cudaMemcpyHostToDevice, defaultStream.stream())); + cudaCheck(cudaMemcpyAsync(buffer.deviceData(), mBuffer.data(), mBuffer.size(), + cudaMemcpyHostToDevice, defaultStream.stream())); cudaCheck(cudaStreamSynchronize(defaultStream.stream())); } else if (iAmDevice && guideIsHost) { - at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(mBuffer.device().index()); - cudaCheck(cudaMemcpyAsync(buffer.data(), mBuffer.deviceData(), mBuffer.size(), cudaMemcpyDeviceToHost, defaultStream.stream())); + at::cuda::CUDAStream defaultStream = + at::cuda::getCurrentCUDAStream(mBuffer.device().index()); + cudaCheck(cudaMemcpyAsync(buffer.data(), mBuffer.deviceData(), mBuffer.size(), + cudaMemcpyDeviceToHost, defaultStream.stream())); cudaCheck(cudaStreamSynchronize(defaultStream.stream())); } else if (iAmDevice && guideIsDevice) { if (mBuffer.device() == guide.device()) { - at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(mBuffer.device().index()); - cudaCheck(cudaMemcpyAsync(buffer.deviceData(), mBuffer.deviceData(), mBuffer.size(), cudaMemcpyDeviceToDevice, defaultStream.stream())); + at::cuda::CUDAStream defaultStream = + at::cuda::getCurrentCUDAStream(mBuffer.device().index()); + 
cudaCheck(cudaMemcpyAsync(buffer.deviceData(), mBuffer.deviceData(), mBuffer.size(), + cudaMemcpyDeviceToDevice, defaultStream.stream())); cudaCheck(cudaStreamSynchronize(defaultStream.stream())); } else { std::unique_ptr buf(new uint8_t[mBuffer.size()]); - at::cuda::CUDAStream mBufferStream = at::cuda::getCurrentCUDAStream(mBuffer.device().index()); - at::cuda::CUDAStream outBufferStream = at::cuda::getCurrentCUDAStream(buffer.device().index()); - cudaCheck(cudaMemcpyAsync(buf.get(), mBuffer.deviceData(), mBuffer.size(), cudaMemcpyDeviceToHost, mBufferStream.stream())); + at::cuda::CUDAStream mBufferStream = + at::cuda::getCurrentCUDAStream(mBuffer.device().index()); + at::cuda::CUDAStream outBufferStream = + at::cuda::getCurrentCUDAStream(buffer.device().index()); + cudaCheck(cudaMemcpyAsync(buf.get(), mBuffer.deviceData(), mBuffer.size(), + cudaMemcpyDeviceToHost, mBufferStream.stream())); cudaCheck(cudaStreamSynchronize(mBufferStream.stream())); - cudaCheck(cudaMemcpyAsync(buffer.deviceData(), buf.get(), mBuffer.size(), cudaMemcpyHostToDevice, outBufferStream.stream())); + cudaCheck(cudaMemcpyAsync(buffer.deviceData(), buf.get(), mBuffer.size(), + cudaMemcpyHostToDevice, outBufferStream.stream())); cudaCheck(cudaStreamSynchronize(outBufferStream.stream())); } } return GridHandle(std::move(buffer)); } -} +} // namespace nanovdb namespace fvdb { namespace detail { -TorchDeviceBuffer::TorchDeviceBuffer(uint64_t size /* = 0*/, void* data /* = nullptr*/, bool host /* = true*/, int deviceIndex /* = -1*/) - : mSize(0) - , mCpuData(nullptr) - , mGpuData(nullptr) - , mDevice(host ? torch::kCPU : torch::kCUDA, deviceIndex) { - - TORCH_CHECK(host || (!host && deviceIndex >= 0), "You must set deviceIndex when setting host to false"); +TorchDeviceBuffer::TorchDeviceBuffer(uint64_t size /* = 0*/, void *data /* = nullptr*/, + bool host /* = true*/, int deviceIndex /* = -1*/) + : mSize(0), mCpuData(nullptr), mGpuData(nullptr), + mDevice(host ? 
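The copy specialization above is a four-way dispatch on where the source buffer and the guide buffer live, with cross-GPU copies staged through a temporary host buffer. A reduced sketch, with stream and error handling trimmed to the essentials (the helper name and the single-stream simplification are assumptions):

    #include <cuda_runtime.h>
    #include <cstring>
    #include <memory>

    void copyBytes(void *dst, bool dstOnDevice, const void *src, bool srcOnDevice,
                   size_t size, cudaStream_t stream, bool sameDevice) {
        if (!srcOnDevice && !dstOnDevice) {
            std::memcpy(dst, src, size);                                       // CPU -> CPU
        } else if (!srcOnDevice && dstOnDevice) {
            cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, stream);   // CPU -> GPU
            cudaStreamSynchronize(stream);
        } else if (srcOnDevice && !dstOnDevice) {
            cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToHost, stream);   // GPU -> CPU
            cudaStreamSynchronize(stream);
        } else if (sameDevice) {
            cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream); // same GPU
            cudaStreamSynchronize(stream);
        } else {
            // Different GPUs: stage the bytes through a temporary host buffer.
            std::unique_ptr<unsigned char[]> staging(new unsigned char[size]);
            cudaMemcpyAsync(staging.get(), src, size, cudaMemcpyDeviceToHost, stream);
            cudaStreamSynchronize(stream);
            cudaMemcpyAsync(dst, staging.get(), size, cudaMemcpyHostToDevice, stream);
            cudaStreamSynchronize(stream);
        }
    }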
torch::kCPU : torch::kCUDA, deviceIndex) { + TORCH_CHECK(host || (!host && deviceIndex >= 0), + "You must set deviceIndex when setting host to false"); this->init(size, data, host); } - -TorchDeviceBuffer::TorchDeviceBuffer(TorchDeviceBuffer&& other) noexcept - : mSize(other.mSize) - , mCpuData(other.mCpuData) - , mGpuData(other.mGpuData) - , mDevice(other.mDevice) { - other.mSize = 0; +TorchDeviceBuffer::TorchDeviceBuffer(TorchDeviceBuffer &&other) noexcept + : mSize(other.mSize), mCpuData(other.mCpuData), mGpuData(other.mGpuData), + mDevice(other.mDevice) { + other.mSize = 0; other.mCpuData = nullptr; other.mGpuData = nullptr; } - -TorchDeviceBuffer& TorchDeviceBuffer::operator=(TorchDeviceBuffer&& other) noexcept { +TorchDeviceBuffer & +TorchDeviceBuffer::operator=(TorchDeviceBuffer &&other) noexcept { clear(); - mSize = other.mSize; - mCpuData = other.mCpuData; - mGpuData = other.mGpuData; - mDevice = other.mDevice; - other.mSize = 0; + mSize = other.mSize; + mCpuData = other.mCpuData; + mGpuData = other.mGpuData; + mDevice = other.mDevice; + other.mSize = 0; other.mCpuData = nullptr; other.mGpuData = nullptr; return *this; } - -void TorchDeviceBuffer::setDevice(const torch::Device& toDevice, bool blocking) { +void +TorchDeviceBuffer::setDevice(const torch::Device &toDevice, bool blocking) { // Same device, no-op if (toDevice == mDevice) { return; @@ -122,11 +131,10 @@ void TorchDeviceBuffer::setDevice(const torch::Device& toDevice, bool blocking) } else { TORCH_CHECK(false, "Only CPU and CUDA devices are supported") } - } -void TorchDeviceBuffer::toCpu(bool blocking) { - +void +TorchDeviceBuffer::toCpu(bool blocking) { // Empty buffer, set the device and return if (mGpuData == nullptr && mCpuData == nullptr) { mDevice = torch::kCPU; @@ -148,7 +156,8 @@ void TorchDeviceBuffer::toCpu(bool blocking) { mDevice = torch::kCPU; } -void TorchDeviceBuffer::toCuda(torch::Device toDevice, bool blocking) { +void +TorchDeviceBuffer::toCuda(torch::Device toDevice, bool blocking) { TORCH_CHECK(toDevice.is_cuda(), "Invalid device must be a CUDA device"); TORCH_CHECK(toDevice.has_index(), "Invalid device must specify device index"); @@ -168,38 +177,42 @@ void TorchDeviceBuffer::toCuda(torch::Device toDevice, bool blocking) { { c10::cuda::CUDAGuard deviceGuard(mDevice); at::cuda::CUDAStream currentStream = at::cuda::getCurrentCUDAStream(mDevice.index()); - cudaCheck(cudaMemcpyAsync(buf.get(), mGpuData, mSize, cudaMemcpyDeviceToHost, currentStream.stream())); + cudaCheck(cudaMemcpyAsync(buf.get(), mGpuData, mSize, cudaMemcpyDeviceToHost, + currentStream.stream())); cudaCheck(cudaStreamSynchronize(currentStream.stream())); c10::cuda::CUDACachingAllocator::raw_delete(mGpuData); } { c10::cuda::CUDAGuard deviceGuard(toDevice); at::cuda::CUDAStream toStream = at::cuda::getCurrentCUDAStream(toDevice.index()); - mGpuData = reinterpret_cast(c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(mSize, toStream.stream())); - cudaCheck(cudaMemcpyAsync(mGpuData, buf.get(), mSize, cudaMemcpyHostToDevice, toStream.stream())); + mGpuData = reinterpret_cast( + c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(mSize, toStream.stream())); + cudaCheck(cudaMemcpyAsync(mGpuData, buf.get(), mSize, cudaMemcpyHostToDevice, + toStream.stream())); } mDevice = toDevice; - } else if (mDevice.is_cpu()) { // CPU -> CUDA + } else if (mDevice.is_cpu()) { // CPU -> CUDA TORCH_CHECK(toDevice.has_index(), "Invalid device must specify device index"); c10::cuda::CUDAGuard deviceGuard(toDevice); at::cuda::CUDAStream stream = 
at::cuda::getCurrentCUDAStream(toDevice.index()); - copyHostToDeviceAndFreeHost((void*) stream.stream(), blocking); + copyHostToDeviceAndFreeHost((void *)stream.stream(), blocking); mDevice = toDevice; } else { TORCH_CHECK(false, "This should never happen. File a bug.") } } - -void TorchDeviceBuffer::init(uint64_t size, void* data /* = nullptr */, bool host /* = true */) { - TORCH_CHECK((host && mDevice.is_cpu()) || (!host && mDevice.is_cuda()), "Invalid device for host argument to TorchDeviceBuffer::init"); +void +TorchDeviceBuffer::init(uint64_t size, void *data /* = nullptr */, bool host /* = true */) { + TORCH_CHECK((host && mDevice.is_cpu()) || (!host && mDevice.is_cuda()), + "Invalid device for host argument to TorchDeviceBuffer::init"); if (size == mSize) { // If we already initialized the buffer with the same size, just return return; } - if (mSize >= 0) { // If we're initializing to a different size, need to free the old buffer + if (mSize >= 0) { // If we're initializing to a different size, need to free the old buffer this->clear(); } - if (size == 0) { // If we're initializing to a size of 0, just return + if (size == 0) { // If we're initializing to a size of 0, just return return; } @@ -209,26 +222,30 @@ void TorchDeviceBuffer::init(uint64_t size, void* data /* = nullptr */, bool hos // Initalize on the host if (host) { if (data) { - mCpuData = (uint8_t*) data; + mCpuData = (uint8_t *)data; } else { - // cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned - mCpuData = (uint8_t*) malloc(size); + // cudaCheck(cudaMallocHost((void**)&mCpuData, size)); // un-managed pinned memory on + // the host (can be slow to access!). Always 32B aligned + mCpuData = (uint8_t *)malloc(size); } // checkPtr(mCpuData, "failed to allocate host data"); - // Initalize on the device + // Initalize on the device } else { if (data) { - mGpuData = (uint8_t*) data; + mGpuData = (uint8_t *)data; } else { c10::cuda::CUDAGuard deviceGuard(mDevice); at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(mDevice.index()); - mGpuData = reinterpret_cast(c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(size, defaultStream.stream())); + mGpuData = + reinterpret_cast(c10::cuda::CUDACachingAllocator::raw_alloc_with_stream( + size, defaultStream.stream())); checkPtr(mGpuData, "failed to allocate device data"); } } } -void TorchDeviceBuffer::clear() { +void +TorchDeviceBuffer::clear() { if (mGpuData) { c10::cuda::CUDACachingAllocator::raw_delete(mGpuData); } @@ -237,30 +254,37 @@ void TorchDeviceBuffer::clear() { free(mCpuData); } mCpuData = mGpuData = nullptr; - mSize = 0; + mSize = 0; } -TorchDeviceBuffer TorchDeviceBuffer::create(uint64_t size, const TorchDeviceBuffer* proto, bool host, void* stream) { - // This is a hack to pass in the device index when creating grids from nanovdb. Since we can't pass arguments - // through nanovdb creation functions, we use a prototype grid to pass in the device index. +TorchDeviceBuffer +TorchDeviceBuffer::create(uint64_t size, const TorchDeviceBuffer *proto, bool host, void *stream) { + // This is a hack to pass in the device index when creating grids from nanovdb. Since we can't + // pass arguments through nanovdb creation functions, we use a prototype grid to pass in the + // device index. 
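    // Illustrative sketch of the guide-buffer mechanism described above (not taken from this
    // patch; `gridByteSize` is a hypothetical size, and the calls assume the public API declared
    // in TorchDeviceBuffer.h further below):
    //
    //   TorchDeviceBuffer guide(0, nullptr, /*host=*/false, /*deviceIndex=*/1); // empty prototype on cuda:1
    //   TorchDeviceBuffer buf = TorchDeviceBuffer::create(gridByteSize, &guide, /*host=*/false);
    //   // buf.device() should now report cuda:1 and buf.deviceData() point at device memory.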
int deviceId = -1; if (proto != nullptr) { - TORCH_CHECK((host && proto->device().is_cpu()) || (!host && proto->device().is_cuda()), "Invalid guide buffer device for host argument to TorchDeviceBuffer::create"); + TORCH_CHECK((host && proto->device().is_cpu()) || (!host && proto->device().is_cuda()), + "Invalid guide buffer device for host argument to TorchDeviceBuffer::create"); deviceId = proto->mDevice.index(); } return TorchDeviceBuffer(size, nullptr, host, host ? -1 : deviceId); } -void TorchDeviceBuffer::copyDeviceToHostAndFreeDevice(void* streamPtr /* = 0*/, bool blocking /* = true*/) { +void +TorchDeviceBuffer::copyDeviceToHostAndFreeDevice(void *streamPtr /* = 0*/, + bool blocking /* = true*/) { cudaStream_t stream = reinterpret_cast(streamPtr); TORCH_CHECK(mGpuData, "uninitialized cpu data, this should never happen"); if (mCpuData == nullptr) { // Allocate CPU data if we upload to the device - // cudaCheck(cudaMallocHost((void**)&mCpuData, mSize)); // un-managed pinned memory on the host (can be slow to access!). Always 32B aligned - mCpuData = (uint8_t*) malloc(mSize); + // cudaCheck(cudaMallocHost((void**)&mCpuData, mSize)); // un-managed pinned memory on the + // host (can be slow to access!). Always 32B aligned + mCpuData = (uint8_t *)malloc(mSize); } // Copy to the host buffer - cudaCheck(cudaMemcpyAsync(mCpuData, mGpuData, mSize, cudaMemcpyDeviceToHost, reinterpret_cast(stream))); + cudaCheck(cudaMemcpyAsync(mCpuData, mGpuData, mSize, cudaMemcpyDeviceToHost, + reinterpret_cast(stream))); if (blocking) { cudaCheck(cudaStreamSynchronize(reinterpret_cast(stream))); } @@ -268,12 +292,15 @@ void TorchDeviceBuffer::copyDeviceToHostAndFreeDevice(void* streamPtr /* = 0*/, c10::cuda::CUDACachingAllocator::raw_delete(mGpuData); } -void TorchDeviceBuffer::copyHostToDeviceAndFreeHost(void* streamPtr /* = 0*/, bool blocking /* = true*/) { +void +TorchDeviceBuffer::copyHostToDeviceAndFreeHost(void *streamPtr /* = 0*/, + bool blocking /* = true*/) { cudaStream_t stream = reinterpret_cast(streamPtr); TORCH_CHECK(mCpuData, "uninitialized cpu data, this should never happen"); - if (mGpuData == nullptr) { // Allocate a new CUDA buffer - mGpuData = reinterpret_cast(c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(mSize, stream)); + if (mGpuData == nullptr) { // Allocate a new CUDA buffer + mGpuData = reinterpret_cast( + c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(mSize, stream)); } // Copy the data to the CUDA buffer cudaCheck(cudaMemcpyAsync(mGpuData, mCpuData, mSize, cudaMemcpyHostToDevice, stream)); diff --git a/fvdb/src/detail/TorchDeviceBuffer.h b/fvdb/src/detail/TorchDeviceBuffer.h index 4a89d8eb59..bda65b912e 100644 --- a/fvdb/src/detail/TorchDeviceBuffer.h +++ b/fvdb/src/detail/TorchDeviceBuffer.h @@ -1,12 +1,12 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once - -#include +#ifndef FVDB_DETAIL_TORCHDEVICEBUFFER_H +#define FVDB_DETAIL_TORCHDEVICEBUFFER_H #include // for BufferTraits +#include namespace fvdb { namespace detail { @@ -16,56 +16,65 @@ namespace detail { /// @brief Simple memory buffer using un-managed pinned host memory when compiled with NVCC. /// Obviously this class is making explicit used of CUDA so replace it with your own memory /// allocator if you are not using CUDA. 
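/// A minimal usage sketch (illustrative only, not part of this patch; it assumes the member
/// functions declared below, with blocking copies):
///   TorchDeviceBuffer buf(1024);                                      // 1 KiB allocated on the host
///   buf.setDevice(torch::Device(torch::kCUDA, 0), /*blocking=*/true); // copy to cuda:0, host copy is freed
///   uint8_t *d = buf.deviceData();                                    // raw device pointer
///   buf.setDevice(torch::kCPU, /*blocking=*/true);                    // copy back and free the device memory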
-/// @note While CUDA's pinned host memory allows for asynchronous memory copy between host and device +/// @note While CUDA's pinned host memory allows for asynchronous memory copy between host and +/// device /// it is significantly slower then cached (un-pinned) memory on the host. -class TorchDeviceBuffer -{ - uint64_t mSize; // total number of bytes for the NanoVDB grid. - uint8_t *mCpuData, *mGpuData; // raw buffer for the NanoVDB grid. +class TorchDeviceBuffer { + uint64_t mSize; // total number of bytes for the NanoVDB grid. + uint8_t *mCpuData, *mGpuData; // raw buffer for the NanoVDB grid. torch::Device mDevice = torch::Device(torch::kCPU); - /// @brief Helper function to move this buffer to the CPU. If the buffer is on the GPU, the GPU memory will be freed. - /// @param blocking If set to false, then memory allocations and copies are performed asynchronously + /// @brief Helper function to move this buffer to the CPU. If the buffer is on the GPU, the GPU + /// memory will be freed. + /// @param blocking If set to false, then memory allocations and copies are performed + /// asynchronously void toCpu(bool blocking); /// @brief Helper function to move this buffer to the specified CUDA device. /// @param device The device on which the buffer should be moved to - /// @param blocking If set to false, then memory allocations and copies are performed asynchronously + /// @param blocking If set to false, then memory allocations and copies are performed + /// asynchronously void toCuda(torch::Device device, bool blocking); - /// @brief Helper function to copy from the host to the device and then free the host buffer. If @c blocking is false the memory copy is asynchronous! + /// @brief Helper function to copy from the host to the device and then free the host buffer. If + /// @c blocking is false the memory copy is asynchronous! /// /// @note This will allocate memory on the GPU/device if it is not already allocated. /// @note The device of this buffer must be CPU - void copyHostToDeviceAndFreeHost(void* stream = 0, bool blocking = true); // Delete + void copyHostToDeviceAndFreeHost(void *stream = 0, bool blocking = true); // Delete - /// @brief Helper function to copy from a device to the host and then free the device buffer. If @c blocking is false the memory copy is asynchronous! + /// @brief Helper function to copy from a device to the host and then free the device buffer. If + /// @c blocking is false the memory copy is asynchronous! /// /// @note This will allocate memory on the host if it is not already allocated. /// @note The device of this buffer must be CPU - void copyDeviceToHostAndFreeDevice(void* stream = 0, bool blocking = true); // Delete + void copyDeviceToHostAndFreeDevice(void *stream = 0, bool blocking = true); // Delete -public: - /// @brief Default constructor initializes a buffer with the given size and device specified by host and deviceIndex. - /// @note This has a weird API because it has to match other buffer classes in nanovdb like nanovdb::HostBuffer + public: + /// @brief Default constructor initializes a buffer with the given size and device specified by + /// host and deviceIndex. 
+ /// @note This has a weird API because it has to match other buffer classes in nanovdb like + /// nanovdb::HostBuffer /// @param size The size (in bytes to allocate for this buffer) /// @param data If non-null, the data pointer to use for this buffer /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU - /// @param deviceIndex If host is false, then this specifies the device index to use for the buffer + /// @param deviceIndex If host is false, then this specifies the device index to use for the + /// buffer /// (must be set to a nonzero value when host is false) - TorchDeviceBuffer(uint64_t size = 0, void* data = nullptr, bool host = true, int deviceIndex = -1); + TorchDeviceBuffer(uint64_t size = 0, void *data = nullptr, bool host = true, + int deviceIndex = -1); /// @brief Disallow copy-construction - TorchDeviceBuffer(const TorchDeviceBuffer&) = delete; + TorchDeviceBuffer(const TorchDeviceBuffer &) = delete; /// @brief Move copy-constructor - TorchDeviceBuffer(TorchDeviceBuffer&& other) noexcept; + TorchDeviceBuffer(TorchDeviceBuffer &&other) noexcept; /// @brief Disallow copy assignment operation - TorchDeviceBuffer& operator=(const TorchDeviceBuffer&) = delete; + TorchDeviceBuffer &operator=(const TorchDeviceBuffer &) = delete; /// @brief Move copy assignment operation - TorchDeviceBuffer& operator=(TorchDeviceBuffer&& other) noexcept; + TorchDeviceBuffer &operator=(TorchDeviceBuffer &&other) noexcept; /// @brief Destructor frees memory on both the host and device ~TorchDeviceBuffer() { this->clear(); }; @@ -76,56 +85,74 @@ class TorchDeviceBuffer /// The selected device will be this->device which must be a cuda device /// @note All existing buffers are first cleared /// @warning size is expected to be non-zero. Use clear() clear buffer! - void init(uint64_t size, void* data = nullptr, bool host = true); + void init(uint64_t size, void *data = nullptr, bool host = true); /// @brief Set the device of this buffer and shuffle data around accordingly /// @param device The device to be used by this buffer (if CUDA, must specify a device index) /// @param blocking If true the memory copy is synchronous, else asynchronous - void setDevice(const torch::Device& device, bool blocking); + void setDevice(const torch::Device &device, bool blocking); /// @brief Returns the device used by this buffer /// @return The device used by this buffer - const torch::Device& device() const { + const torch::Device & + device() const { return mDevice; } /// @brief Retuns a pointer to the raw memory buffer managed by this allocator. /// @warning Note that the pointer can be NULL is the allocator was not initialized! - uint8_t* data() const { return mCpuData; } - uint8_t* deviceData() const { return mGpuData; } + uint8_t * + data() const { + return mCpuData; + } + uint8_t * + deviceData() const { + return mGpuData; + } /// @brief Returns the size in bytes of the raw memory buffer managed by this allocator. - uint64_t size() const { return mSize; } + uint64_t + size() const { + return mSize; + } /// @brief Returns true if this allocator is empty, i.e. 
has no allocated memory - bool empty() const { return mSize == 0 && mCpuData == nullptr && mGpuData == nullptr; } - bool isEmpty() const { return empty(); } + bool + empty() const { + return mSize == 0 && mCpuData == nullptr && mGpuData == nullptr; + } + bool + isEmpty() const { + return empty(); + } /// @brief De-allocate all memory managed by this allocator and set all pointer to NULL void clear(); /// @brief Static factory method that return an instance of this buffer /// @param size byte size of buffer to be initialized - /// @param guide this argument is there to match the signature of the other create() methods (e.g. nanovdb::HostBuffer) + /// @param guide this argument is there to match the signature of the other create() methods + /// (e.g. nanovdb::HostBuffer) /// and to provide a way to specify the device to be used for the buffer. - /// i.e. if guide is non-null, the created buffer will be on the same device as guide! - /// note you must also set the host argument to match the guide buffer device - /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU. If you passed in a guide + /// i.e. if guide is non-null, the created buffer will be on the same device as + /// guide! note you must also set the host argument to match the guide buffer + /// device + /// @param host If true buffer is initialized only on the host/CPU, else on the device/GPU. If + /// you passed in a guide /// buffer, then this must match the device of the guide buffer! /// @return An instance of this class using move semantics - static TorchDeviceBuffer create(uint64_t size, const TorchDeviceBuffer* guide = nullptr, bool host = true, void* stream = nullptr); + static TorchDeviceBuffer create(uint64_t size, const TorchDeviceBuffer *guide = nullptr, + bool host = true, void *stream = nullptr); }; // TorchDeviceBuffer class - } // namespace detail } // namespace fvdb - namespace nanovdb { - template<> - struct BufferTraits - { - static const bool hasDeviceDual = true; - }; +template <> struct BufferTraits { + static const bool hasDeviceDual = true; +}; } // namespace nanovdb + +#endif // FVDB_DETAIL_TORCHDEVICEBUFFER_H \ No newline at end of file diff --git a/fvdb/src/detail/TypesImpl.h b/fvdb/src/detail/TypesImpl.h index eb957f713d..028fa49bfb 100644 --- a/fvdb/src/detail/TypesImpl.h +++ b/fvdb/src/detail/TypesImpl.h @@ -1,41 +1,47 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#include +#ifndef FVDB_DETAIL_TYPESIMPL_H +#define FVDB_DETAIL_TYPESIMPL_H + +#include +#include namespace fvdb { namespace detail { -template -class Vec3dImpl { +template class Vec3dImpl { nanovdb::Vec3d mValue; - bool mWasScalar = false; + bool mWasScalar = false; -public: + public: static constexpr bool SupportsScalarCast = AllowScalar; - using ValueType = nanovdb::Vec3d::ValueType; + using ValueType = nanovdb::Vec3d::ValueType; Vec3dImpl() : mValue(0.0, 0.0, 0.0) {} - Vec3dImpl(const nanovdb::Vec3d& coord) : mValue(coord) {} - Vec3dImpl(const nanovdb::Vec3f& coord) : mValue(coord[0], coord[1], coord[2]) {} - Vec3dImpl(const torch::Tensor& coordTensor) : mValue(fvdb::tensorToVec3d(coordTensor)) {} - template - Vec3dImpl(const std::vector& coordVec) { - static_assert(std::is_arithmetic::value, "Coord3D can only be constructed from integral types"); - TORCH_CHECK_VALUE(coordVec.size() == 3, "Coord3D can only be constructed from a vector of size 3"); + Vec3dImpl(const nanovdb::Vec3d &coord) : mValue(coord) {} + Vec3dImpl(const nanovdb::Vec3f &coord) : 
mValue(coord[0], coord[1], coord[2]) {} + Vec3dImpl(const torch::Tensor &coordTensor) : mValue(fvdb::tensorToVec3d(coordTensor)) {} + template Vec3dImpl(const std::vector &coordVec) { + static_assert(std::is_arithmetic::value, + "Coord3D can only be constructed from integral types"); + TORCH_CHECK_VALUE(coordVec.size() == 3, + "Coord3D can only be constructed from a vector of size 3"); mValue = nanovdb::Vec3d(coordVec[0], coordVec[1], coordVec[2]); } - template - Vec3dImpl(T scalar) { - static_assert(AllowScalar, "Vec3d can only be constructed from a scalar if AllowScalar is true"); - static_assert(std::is_arithmetic::value, "Vec3d can only be constructed from numeric types"); - mValue = nanovdb::Vec3d(scalar, scalar, scalar); + template Vec3dImpl(T scalar) { + static_assert(AllowScalar, + "Vec3d can only be constructed from a scalar if AllowScalar is true"); + static_assert(std::is_arithmetic::value, + "Vec3d can only be constructed from numeric types"); + mValue = nanovdb::Vec3d(scalar, scalar, scalar); mWasScalar = true; } - const nanovdb::Vec3d& value() const { + const nanovdb::Vec3d & + value() const { if constexpr (!AllowScalar) { TORCH_CHECK_VALUE(!mWasScalar, "Expected a vector, but got a scalar"); } @@ -43,83 +49,88 @@ class Vec3dImpl { } }; - -template -class Coord3Impl { +template class Coord3Impl { nanovdb::Coord mValue; - bool mWasScalar = false; + bool mWasScalar = false; -public: + public: static constexpr bool SupportsScalarCast = AllowScalar; - using ValueType = nanovdb::Coord::ValueType; + using ValueType = nanovdb::Coord::ValueType; Coord3Impl() : mValue(0, 0, 0) {} - Coord3Impl(const nanovdb::Coord& coord) : mValue(coord) {} - Coord3Impl(const nanovdb::Vec3i& coord) : mValue(coord[0], coord[1], coord[2]) {} - Coord3Impl(const nanovdb::Vec3u& coord) : mValue(coord[0], coord[1], coord[2]) {} - Coord3Impl(const torch::Tensor& coordTensor) : mValue(fvdb::tensorToCoord(coordTensor)) {} - template - Coord3Impl(const std::vector& coordVec) { - static_assert(std::is_integral::value, "Coord can only be constructed from integral types"); - TORCH_CHECK_VALUE(coordVec.size() == 3, "Coord can only be constructed from a vector of size 3"); + Coord3Impl(const nanovdb::Coord &coord) : mValue(coord) {} + Coord3Impl(const nanovdb::Vec3i &coord) : mValue(coord[0], coord[1], coord[2]) {} + Coord3Impl(const nanovdb::Vec3u &coord) : mValue(coord[0], coord[1], coord[2]) {} + Coord3Impl(const torch::Tensor &coordTensor) : mValue(fvdb::tensorToCoord(coordTensor)) {} + template Coord3Impl(const std::vector &coordVec) { + static_assert(std::is_integral::value, + "Coord can only be constructed from integral types"); + TORCH_CHECK_VALUE(coordVec.size() == 3, + "Coord can only be constructed from a vector of size 3"); mValue = nanovdb::Coord(coordVec[0], coordVec[1], coordVec[2]); } - template - Coord3Impl(T scalar) { - static_assert(AllowScalar, "Coord3 can only be constructed from a scalar if AllowScalar is true"); - static_assert(std::is_integral::value, "Coord3D can only be constructed from integral types"); - mValue = nanovdb::Coord(scalar, scalar, scalar); + template Coord3Impl(T scalar) { + static_assert(AllowScalar, + "Coord3 can only be constructed from a scalar if AllowScalar is true"); + static_assert(std::is_integral::value, + "Coord3D can only be constructed from integral types"); + mValue = nanovdb::Coord(scalar, scalar, scalar); mWasScalar = true; } - const nanovdb::Coord& value() const { + const nanovdb::Coord & + value() const { if constexpr (!AllowScalar) { 
TORCH_CHECK_VALUE(!mWasScalar, "Expected a vector, but got a scalar"); } return mValue; } - std::string toString() const { - return "{" + std::to_string(mValue[0]) + ", " + std::to_string(mValue[1]) + ", " + std::to_string(mValue[2]) + "}"; + std::string + toString() const { + return "{" + std::to_string(mValue[0]) + ", " + std::to_string(mValue[1]) + ", " + + std::to_string(mValue[2]) + "}"; } }; - -template -class Coord4Impl { +template class Coord4Impl { nanovdb::Vec4i mValue; - static_assert(!AllowScalar, "Coord does not allow scalar conversion. We may wish to change this in the future."); + static_assert( + !AllowScalar, + "Coord does not allow scalar conversion. We may wish to change this in the future."); -public: + public: static constexpr bool SupportsScalarCast = AllowScalar; - using ValueType = nanovdb::Coord::ValueType; + using ValueType = nanovdb::Coord::ValueType; Coord4Impl() : mValue(0, 0, 0, 0) {} - Coord4Impl(const nanovdb::Vec4i& coord) : mValue(coord) {} - Coord4Impl(const torch::Tensor& coordTensor) : mValue(fvdb::tensorToCoord4(coordTensor)) {} - template - Coord4Impl(const std::vector& coordVec) { - static_assert(std::is_integral::value, "Vec4i can only be constructed from integral types"); - TORCH_CHECK_VALUE(coordVec.size() == 4, "Vec4i can only be constructed from a vector of size 4"); + Coord4Impl(const nanovdb::Vec4i &coord) : mValue(coord) {} + Coord4Impl(const torch::Tensor &coordTensor) : mValue(fvdb::tensorToCoord4(coordTensor)) {} + template Coord4Impl(const std::vector &coordVec) { + static_assert(std::is_integral::value, + "Vec4i can only be constructed from integral types"); + TORCH_CHECK_VALUE(coordVec.size() == 4, + "Vec4i can only be constructed from a vector of size 4"); mValue = nanovdb::Vec4i(coordVec[0], coordVec[1], coordVec[2], coordVec[3]); } - const nanovdb::Vec4i& value() const { + const nanovdb::Vec4i & + value() const { return mValue; } }; - -template -class Vec3BatchImpl { -private: +template class Vec3BatchImpl { + private: std::vector mValue; - bool isScalar = false; - bool isSingle = false; + bool isScalar = false; + bool isSingle = false; - std::vector repeatIt(int64_t batchSize, bool onlyPositive) const { + std::vector + repeatIt(int64_t batchSize, bool onlyPositive) const { if (onlyPositive) { - TORCH_CHECK_VALUE(mValue[0][0] > 0 && mValue[0][1] > 0 && mValue[0][2] > 0, "Expected all coordinates to be positive"); + TORCH_CHECK_VALUE(mValue[0][0] > 0 && mValue[0][1] > 0 && mValue[0][2] > 0, + "Expected all coordinates to be positive"); } std::vector result; result.reserve(batchSize); @@ -129,16 +140,16 @@ class Vec3BatchImpl { return result; } -public: - static constexpr bool SupportsBroadcast = AllowBroadcast; + public: + static constexpr bool SupportsBroadcast = AllowBroadcast; static constexpr bool SupportsScalarCast = AllowScalar; using ValueType = typename VecT::ValueType; - using VecType = VecT; + using VecType = VecT; Vec3BatchImpl() : mValue() {} - Vec3BatchImpl(const torch::Tensor& tensor) { + Vec3BatchImpl(const torch::Tensor &tensor) { torch::Tensor squeezed = tensor.squeeze().cpu(); if constexpr (AllowScalar) { @@ -151,60 +162,74 @@ class Vec3BatchImpl { if constexpr (AllowBroadcast) { if (squeezed.numel() == 3) { - mValue.push_back(VecT(squeezed[0].item(), squeezed[1].item(), squeezed[2].item())); + mValue.push_back(VecT(squeezed[0].item(), squeezed[1].item(), + squeezed[2].item())); isSingle = true; return; } } - TORCH_CHECK_VALUE(squeezed.dim() == 2, "Expected a batch of 3D coordinates with size [B, 3]"); - 
TORCH_CHECK_VALUE(squeezed.size(1) == 3, "Expected a batch of 3D coordinates with size [B, 3]"); + TORCH_CHECK_VALUE(squeezed.dim() == 2, + "Expected a batch of 3D coordinates with size [B, 3]"); + TORCH_CHECK_VALUE(squeezed.size(1) == 3, + "Expected a batch of 3D coordinates with size [B, 3]"); mValue.reserve(squeezed.size(0)); for (int i = 0; i < squeezed.size(0); ++i) { - mValue.push_back(VecT(squeezed[i][0].item(), squeezed[i][1].item(), squeezed[i][2].item())); + mValue.push_back(VecT(squeezed[i][0].item(), squeezed[i][1].item(), + squeezed[i][2].item())); } } - template - Vec3BatchImpl(const std::vector>& vectorData) { + template Vec3BatchImpl(const std::vector> &vectorData) { if constexpr (nanovdb::util::is_same::value) { - static_assert(std::is_integral::value, "Vec3Batch can only be constructed from integral types"); + static_assert(std::is_integral::value, + "Vec3Batch can only be constructed from integral types"); } - static_assert(std::is_arithmetic::value, "Vec3Batch can only be constructed from numeric types"); + static_assert(std::is_arithmetic::value, + "Vec3Batch can only be constructed from numeric types"); size_t batchSize = vectorData.size(); TORCH_CHECK_VALUE(batchSize > 0, "Expected a batch of coordinates with size [B, 3]"); for (size_t i = 0; i < batchSize; i += 1) { - TORCH_CHECK_VALUE(vectorData[i].size() == 3, "Expected a batch of 3D coordinates with size [B, 3]"); + TORCH_CHECK_VALUE(vectorData[i].size() == 3, + "Expected a batch of 3D coordinates with size [B, 3]"); mValue.push_back(VecT(vectorData[i][0], vectorData[i][1], vectorData[i][2])); } } - template - Vec3BatchImpl(const T& scalar) { - static_assert(AllowScalar, "Cannot construct Vec3Batch from scalar when AllowScalar is set to false"); + template Vec3BatchImpl(const T &scalar) { + static_assert(AllowScalar, + "Cannot construct Vec3Batch from scalar when AllowScalar is set to false"); if constexpr (nanovdb::util::is_same::value) { - static_assert(std::is_integral::value, "Vec3Batch can only be constructed from integral types"); + static_assert(std::is_integral::value, + "Vec3Batch can only be constructed from integral types"); } - static_assert(std::is_arithmetic::value, "Vec3Batch can only be constructed from numeric types"); - mValue.push_back(VecT((double) scalar)); + static_assert(std::is_arithmetic::value, + "Vec3Batch can only be constructed from numeric types"); + mValue.push_back(VecT((double)scalar)); isScalar = true; } - template - Vec3BatchImpl(const std::vector& vec) { - static_assert(AllowBroadcast, "Cannot construct Vec3Batch from single vector when AllowBroadcast is set to false"); + template Vec3BatchImpl(const std::vector &vec) { + static_assert( + AllowBroadcast, + "Cannot construct Vec3Batch from single vector when AllowBroadcast is set to false"); if constexpr (nanovdb::util::is_same::value) { - static_assert(std::is_integral::value, "Vec3Batch can only be constructed from integral types"); + static_assert(std::is_integral::value, + "Vec3Batch can only be constructed from integral types"); } - static_assert(std::is_arithmetic::value, "Vec3Batch can only be constructed from numeric types"); - TORCH_CHECK_VALUE(vec.size() == 3, "Expected a batch of 3D coordinates with size [B, 3] or a single coordinate of size [3,]"); + static_assert(std::is_arithmetic::value, + "Vec3Batch can only be constructed from numeric types"); + TORCH_CHECK_VALUE( + vec.size() == 3, + "Expected a batch of 3D coordinates with size [B, 3] or a single coordinate of size [3,]"); mValue.push_back(VecT(vec[0], vec[1], 
vec[2])); isSingle = true; } - std::vector value(uint64_t batchSize, bool onlyPositive, std::string name) const { + std::vector + value(uint64_t batchSize, bool onlyPositive, std::string name) const { TORCH_CHECK(batchSize > 0, "Can't request empty batch of coordinates"); TORCH_CHECK(mValue.size() > 0, "Can't request empty batch of coordinates"); @@ -212,7 +237,6 @@ class Vec3BatchImpl { if (isScalar) { return repeatIt(batchSize, onlyPositive); } - } if constexpr (AllowBroadcast) { if (isSingle && batchSize != 1) { @@ -222,20 +246,24 @@ class Vec3BatchImpl { if (onlyPositive) { for (size_t i = 0; i < mValue.size(); ++i) { - TORCH_CHECK_VALUE(mValue[i][0] > 0 && mValue[i][1] > 0 && mValue[i][2] > 0, "Expected all coordinates of " + name + " to be positive"); + TORCH_CHECK_VALUE(mValue[i][0] > 0 && mValue[i][1] > 0 && mValue[i][2] > 0, + "Expected all coordinates of " + name + " to be positive"); } } - TORCH_CHECK_VALUE(batchSize == mValue.size(), "Expected " + name + " batch of 3D coordinates to have size [" + std::to_string(batchSize) + ", 3]"); + TORCH_CHECK_VALUE(batchSize == mValue.size(), + "Expected " + name + " batch of 3D coordinates to have size [" + + std::to_string(batchSize) + ", 3]"); return mValue; } - torch::Tensor tensorValue(uint64_t batchSize, bool onlyPositive, std::string name) const { + torch::Tensor + tensorValue(uint64_t batchSize, bool onlyPositive, std::string name) const { std::vector vec = value(batchSize, onlyPositive, name); if constexpr (nanovdb::util::is_same::value) { - return torch::from_blob(vec.data(), { (int64_t) vec.size(), 3 }, torch::kInt32).clone(); + return torch::from_blob(vec.data(), { (int64_t)vec.size(), 3 }, torch::kInt32).clone(); } else if constexpr (nanovdb::util::is_same::value) { - return torch::from_blob(vec.data(), { (int64_t) vec.size(), 3 }, torch::kDouble).clone(); + return torch::from_blob(vec.data(), { (int64_t)vec.size(), 3 }, torch::kDouble).clone(); } else { static_assert("Only Coord and Vec3d are supported for now"); } @@ -244,3 +272,5 @@ class Vec3BatchImpl { } // namespace detail } // namespace fvdb + +#endif // FVDB_DETAIL_TYPESIMPL_H \ No newline at end of file diff --git a/fvdb/src/detail/VoxelCoordTransform.h b/fvdb/src/detail/VoxelCoordTransform.h index bf69328c5f..f98859732a 100644 --- a/fvdb/src/detail/VoxelCoordTransform.h +++ b/fvdb/src/detail/VoxelCoordTransform.h @@ -1,86 +1,93 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_VOXELCOORDTRANSFORM_H +#define FVDB_DETAIL_VOXELCOORDTRANSFORM_H + +#include "utils/Utils.h" #include #include -#include "detail/utils/Utils.h" - - namespace fvdb { namespace detail { /// @brief A class representing the the transformation from world space (xyz) to voxel space (ijk) -/// its inverse, and gradient. It can be applied to points, vectors and rays. It stores the transformation in -/// float16, float32 and float64 precision, using the appropriate representation depending on the -/// input types. +/// its inverse, and gradient. It can be applied to points, vectors and rays. It stores the +/// transformation in float16, float32 and float64 precision, using the appropriate +/// representation depending on the input types. /// @note This class currently only supports translation and non-uniform scaling transformations. 
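// Worked example (illustrative, not from this patch) of the world-to-voxel mapping this class
// implements, using the primalVoxelTransformForSizeAndOrigin() helper defined at the end of this
// header. A voxel size of 0.1 with the [0, 0, 0] voxel centred at the world origin gives a scale
// of 10 and zero translation:
//
//   VoxelCoordTransform xform = primalVoxelTransformForSizeAndOrigin(
//       nanovdb::Vec3d(0.1, 0.1, 0.1), nanovdb::Vec3d(0.0, 0.0, 0.0));
//   nanovdb::math::Vec3<double> ijk = xform.apply(1.0, 2.0, 3.0); // world (1, 2, 3) -> voxel (10, 20, 30)
//   nanovdb::math::Vec3<double> xyz = xform.applyInv(ijk);        // voxel -> world, recovers (1, 2, 3)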
struct VoxelCoordTransform { - /// @brief Construct a voxel coordinate transform with identity transformation /// @return The voxel coordinate transform __hostdev__ VoxelCoordTransform() {}; - /// @brief Construct a voxel coordinate transform that scales and translates each input point when mappint to voxel coordinates + /// @brief Construct a voxel coordinate transform that scales and translates each input point + /// when mappint to voxel coordinates /// @param scale The 3D scale to apply to each input point /// @param translate The 3D translation to apply to each input point - __hostdev__ VoxelCoordTransform(const nanovdb::Vec3d& scale, const nanovdb::Vec3d& translate) : mTransform(scale, translate) {} + __hostdev__ + VoxelCoordTransform(const nanovdb::Vec3d &scale, const nanovdb::Vec3d &translate) + : mTransform(scale, translate) {} /// @brief Apply the gradient of the transformation (from xyz to ijk) to an input point xyz /// @tparam ScalarT The scalar type of the input point xyz /// @param xyz The input point to apply the gradient to /// @return The gradient dT/dxyz of the transformation applied to xyz template - __hostdev__ nanovdb::math::Vec3 applyGrad(const nanovdb::math::Vec3& xyz) const { + __hostdev__ nanovdb::math::Vec3 + applyGrad(const nanovdb::math::Vec3 &xyz) const { static_assert(is_floating_point_or_half::value); return mTransform.scale(); } - /// @brief Apply the gradient of the transformation (from xyz to ijk) to an input point (x, y, z) + /// @brief Apply the gradient of the transformation (from xyz to ijk) to an input point (x, y, + /// z) /// @tparam ScalarT The scalar type of the input point (x, y, z) /// @param x The x component of the input point to apply the gradient to /// @param y The y component of the input point to apply the gradient to /// @param z The z component of the input point to apply the gradient to /// @return The gradient dT/d(x, y, z) of the transformation applied to (x, y, z) template - __hostdev__ nanovdb::math::Vec3 applyGrad(ScalarT x, ScalarT y, ScalarT z) const { + __hostdev__ nanovdb::math::Vec3 + applyGrad(ScalarT x, ScalarT y, ScalarT z) const { static_assert(is_floating_point_or_half::value); return mTransform.scale(); } - - /// @brief Apply the gradient of the inverse transformation (from ijk to xyz) to an input coordinate ijk + /// @brief Apply the gradient of the inverse transformation (from ijk to xyz) to an input + /// coordinate ijk /// @tparam ScalarT The scalar type of the input coordinate ijk /// @param ijk The input point to apply the gradient to /// @return The gradient dT^-1/dijk of the inverse transformation applied to ijk template - __hostdev__ nanovdb::math::Vec3 applyInvGrad(const nanovdb::math::Vec3& ijk) const { + __hostdev__ nanovdb::math::Vec3 + applyInvGrad(const nanovdb::math::Vec3 &ijk) const { static_assert(is_floating_point_or_half::value); return nanovdb::math::Vec3(1.0, 1.0, 1.0) / mTransform.scale(); } - /// @brief Apply the gradient of the inverse transformation (from ijk to xyz) to an input coordinate (i, j, k) + /// @brief Apply the gradient of the inverse transformation (from ijk to xyz) to an input + /// coordinate (i, j, k) /// @tparam ScalarT ScalarT The scalar type of the input coordinate (i, j, k) /// @param i The i component of the input coordinate to apply the gradient to /// @param j The j component of the input coordinate to apply the gradient to /// @param k The k component of the input coordinate to apply the gradient to /// @return The gradient dT^-1/d(i, j, k) of the inverse transformation 
applied to (i, j, k) template - __hostdev__ nanovdb::math::Vec3 applyInvGrad(ScalarT i, ScalarT j, ScalarT k) const { + __hostdev__ nanovdb::math::Vec3 + applyInvGrad(ScalarT i, ScalarT j, ScalarT k) const { static_assert(is_floating_point_or_half::value); return nanovdb::math::Vec3(1.0, 1.0, 1.0) / mTransform.scale(); } - - /// @brief Apply the transformation (from xyz to ijk) to an input point xyz /// @tparam ScalarT The scalar type of the input point xyz /// @param xyz The input point to apply the transformation to /// @return The transformed point T(xyz) template - __hostdev__ nanovdb::math::Vec3 apply(const nanovdb::math::Vec3& xyz) const { + __hostdev__ nanovdb::math::Vec3 + apply(const nanovdb::math::Vec3 &xyz) const { static_assert(is_floating_point_or_half::value); return xyz * mTransform.scale() + mTransform.translate(); } @@ -92,31 +99,34 @@ struct VoxelCoordTransform { /// @param z The z component of the input point to apply the transformation to /// @return The transformed point T(x, y, z) template - __hostdev__ nanovdb::math::Vec3 apply(ScalarT x, ScalarT y, ScalarT z) const { + __hostdev__ nanovdb::math::Vec3 + apply(ScalarT x, ScalarT y, ScalarT z) const { static_assert(is_floating_point_or_half::value); const nanovdb::math::Vec3 xyz(x, y, z); return xyz * mTransform.scale() + mTransform.translate(); } - /// @brief Apply the transformation (from xyz to ijk) to an input point xyz which is of an indexable type + /// @brief Apply the transformation (from xyz to ijk) to an input point xyz which is of an + /// indexable type /// @tparam ScalarT The scalar type of the input point xyz - /// @param xyz The input point to apply the transformation to (must support indexing with [0], [1], [2]) + /// @param xyz The input point to apply the transformation to (must support indexing with [0], + /// [1], [2]) /// @return The transformed point T(xyz) template - __hostdev__ nanovdb::math::Vec3 apply(const InVec3T& xyz) const { + __hostdev__ nanovdb::math::Vec3 + apply(const InVec3T &xyz) const { static_assert(is_floating_point_or_half::value); const nanovdb::math::Vec3 pt(xyz[0], xyz[1], xyz[2]); return pt * mTransform.scale() + mTransform.translate(); } - - /// @brief Apply the inverse transformation (from ijk to xyz) to an input coordinate ijk /// @tparam ScalarT The scalar type of the input coordinate ijk /// @param ijk The input coordinate to apply the inverse transformation to /// @return The transformed coordinate T^-1(ijk) template - __hostdev__ nanovdb::math::Vec3 applyInv(const nanovdb::math::Vec3& ijk) const { + __hostdev__ nanovdb::math::Vec3 + applyInv(const nanovdb::math::Vec3 &ijk) const { static_assert(is_floating_point_or_half::value); return (ijk - mTransform.translate()) / mTransform.scale(); } @@ -128,31 +138,34 @@ struct VoxelCoordTransform { /// @param k The k component of the input coordinate to apply the inverse transformation to /// @return The transformed coordinate T^-1(i, j, k) template - __hostdev__ nanovdb::math::Vec3 applyInv(ScalarT i, ScalarT j, ScalarT k) const { + __hostdev__ nanovdb::math::Vec3 + applyInv(ScalarT i, ScalarT j, ScalarT k) const { static_assert(is_floating_point_or_half::value); const nanovdb::math::Vec3 ijk(i, j, k); return (ijk - mTransform.translate()) / mTransform.scale(); } - /// @brief Apply the inverse transformation (from ijk to xyz) to an input coordinate ijk which is of an indexable type + /// @brief Apply the inverse transformation (from ijk to xyz) to an input coordinate ijk which + /// is of an indexable type /// @tparam ScalarT 
The scalar type of the input coordinate ijk - /// @param ijk The input coordinate to apply the inverse transformation to (must support indexing with [0], [1], [2]) + /// @param ijk The input coordinate to apply the inverse transformation to (must support + /// indexing with [0], [1], [2]) /// @return The transformed coordinate T^-1(ijk) template - __hostdev__ nanovdb::math::Vec3 applyInv(const InVec3T& ijk) const { + __hostdev__ nanovdb::math::Vec3 + applyInv(const InVec3T &ijk) const { static_assert(is_floating_point_or_half::value); const nanovdb::math::Vec3 coord(ijk[0], ijk[1], ijk[2]); return (coord - mTransform.translate()) / mTransform.scale(); } - - /// @brief Apply the transformation (from xyz to ijk) to an input ray /// @tparam ScalarT The scalar type of the input ray /// @param ray The input ray to apply the transformation to /// @return The transformed ray T(ray) template - __hostdev__ nanovdb::math::Ray applyToRay(nanovdb::math::Ray ray) const { + __hostdev__ nanovdb::math::Ray + applyToRay(nanovdb::math::Ray ray) const { static_assert(is_floating_point_or_half::value); const nanovdb::math::Vec3 oVox = apply(ray.eye()); const nanovdb::math::Vec3 dVox = ray.dir() * mTransform.scale(); @@ -171,13 +184,14 @@ struct VoxelCoordTransform { /// @param t1 The maximum ray time parameter /// @return The transformed ray T(ray) template - __hostdev__ nanovdb::math::Ray applyToRay(ScalarT rayOx, ScalarT rayOy, ScalarT rayOz, - ScalarT rayDx, ScalarT rayDy, ScalarT rayDz, - ScalarT t0 = static_cast(0), - ScalarT t1 = std::numeric_limits::infinity()) const { + __hostdev__ nanovdb::math::Ray + applyToRay(ScalarT rayOx, ScalarT rayOy, ScalarT rayOz, ScalarT rayDx, ScalarT rayDy, + ScalarT rayDz, ScalarT t0 = static_cast(0), + ScalarT t1 = std::numeric_limits::infinity()) const { static_assert(is_floating_point_or_half::value); const nanovdb::math::Vec3 oVox = apply(rayOx, rayOy, rayOz); - const nanovdb::math::Vec3 dVox = nanovdb::math::Vec3(rayDx, rayDy, rayDz) * mTransform.scale(); + const nanovdb::math::Vec3 dVox = + nanovdb::math::Vec3(rayDx, rayDy, rayDz) * mTransform.scale(); return nanovdb::math::Ray(oVox, dVox, t0, t1); } @@ -185,7 +199,8 @@ struct VoxelCoordTransform { /// @tparam ScalarT The scalar type to return the scale in /// @return The scale component of this transformation template - __hostdev__ nanovdb::math::Vec3 scale() const { + __hostdev__ nanovdb::math::Vec3 + scale() const { return mTransform.scale(); } @@ -193,105 +208,124 @@ struct VoxelCoordTransform { /// @tparam ScalarT The scalar type to return the translation in /// @return The translation component of this transformation template - __hostdev__ nanovdb::math::Vec3 translate() const { + __hostdev__ nanovdb::math::Vec3 + translate() const { return mTransform.translate(); } - -private: + private: /// @brief A struct representing the transformation from world space (xyz) to voxel space (ijk) - /// in float16, float32, and float64. You can access the scale and translation in any of these - /// by calling methods with the appropriate template paramter + /// in float16, float32, and float64. 
You can access the scale and translation in any of + /// these by calling methods with the appropriate template paramter struct Transform { /// @brief Construct an identity transformation __hostdev__ Transform() {}; /// @brief Construct a transformation that scales and translates each input point - __hostdev__ Transform(nanovdb::Vec3d scale, const nanovdb::Vec3d& translate) : - mScaleh(nanovdb::math::Vec3(c10::Half(float(scale[0])), c10::Half(float(scale[1])), c10::Half(float(scale[2])))), - mTranslateh(nanovdb::math::Vec3(c10::Half(float(translate[0])), c10::Half(float(translate[1])), c10::Half(float(translate[2])))), - mScalef(nanovdb::Vec3f(scale[0], scale[1], scale[2])), - mTranslatef(nanovdb::Vec3f(translate[0], translate[1], translate[2])), - mScaled(scale), - mTranslated(translate) {} - - nanovdb::math::Vec3 mScaleh = nanovdb::math::Vec3(c10::Half(1.0f), c10::Half(1.0f), c10::Half(1.0f)); - nanovdb::math::Vec3 mTranslateh = nanovdb::math::Vec3(c10::Half(0.0f), c10::Half(0.0f), c10::Half(0.0f)); - nanovdb::Vec3f mScalef = nanovdb::Vec3f(1.0f, 1.0f, 1.0f); + __hostdev__ + Transform(nanovdb::Vec3d scale, const nanovdb::Vec3d &translate) + : mScaleh(nanovdb::math::Vec3(c10::Half(float(scale[0])), + c10::Half(float(scale[1])), + c10::Half(float(scale[2])))), + mTranslateh(nanovdb::math::Vec3(c10::Half(float(translate[0])), + c10::Half(float(translate[1])), + c10::Half(float(translate[2])))), + mScalef(nanovdb::Vec3f(scale[0], scale[1], scale[2])), + mTranslatef(nanovdb::Vec3f(translate[0], translate[1], translate[2])), mScaled(scale), + mTranslated(translate) {} + + nanovdb::math::Vec3 mScaleh = + nanovdb::math::Vec3(c10::Half(1.0f), c10::Half(1.0f), c10::Half(1.0f)); + nanovdb::math::Vec3 mTranslateh = + nanovdb::math::Vec3(c10::Half(0.0f), c10::Half(0.0f), c10::Half(0.0f)); + nanovdb::Vec3f mScalef = nanovdb::Vec3f(1.0f, 1.0f, 1.0f); nanovdb::Vec3f mTranslatef = nanovdb::Vec3f(0.0f, 0.0f, 0.0f); - nanovdb::Vec3d mScaled = nanovdb::Vec3d(1.0, 1.0, 1.0); + nanovdb::Vec3d mScaled = nanovdb::Vec3d(1.0, 1.0, 1.0); nanovdb::Vec3d mTranslated = nanovdb::Vec3d(0.0, 0.0, 0.0); /// @brief Get the scale component of this transformation /// @tparam T The scalar type to return the scale in /// @return The scale component of this transformation - template - __hostdev__ inline const nanovdb::math::Vec3& scale() const; + template __hostdev__ inline const nanovdb::math::Vec3 &scale() const; /// @brief Get the translation component of this transformation /// @tparam T The scalar type to return the translation in /// @return The translation component of this transformation - template - __hostdev__ inline const nanovdb::math::Vec3& translate() const; + template __hostdev__ inline const nanovdb::math::Vec3 &translate() const; } mTransform; }; // Template specializations to return the appropriate types template <> -__hostdev__ inline const nanovdb::math::Vec3& VoxelCoordTransform::Transform::scale() const { +__hostdev__ inline const nanovdb::math::Vec3 & +VoxelCoordTransform::Transform::scale() const { return mScaleh; } template <> -__hostdev__ inline const nanovdb::Vec3f& VoxelCoordTransform::Transform::scale() const { +__hostdev__ inline const nanovdb::Vec3f & +VoxelCoordTransform::Transform::scale() const { return mScalef; } template <> -__hostdev__ inline const nanovdb::Vec3d& VoxelCoordTransform::Transform::scale() const { +__hostdev__ inline const nanovdb::Vec3d & +VoxelCoordTransform::Transform::scale() const { return mScaled; } template <> -__hostdev__ inline const nanovdb::math::Vec3& 
VoxelCoordTransform::Transform::translate() const { +__hostdev__ inline const nanovdb::math::Vec3 & +VoxelCoordTransform::Transform::translate() const { return mTranslateh; } template <> -__hostdev__ inline const nanovdb::Vec3f& VoxelCoordTransform::Transform::translate() const { +__hostdev__ inline const nanovdb::Vec3f & +VoxelCoordTransform::Transform::translate() const { return mTranslatef; } template <> -__hostdev__ inline const nanovdb::Vec3d& VoxelCoordTransform::Transform::translate() const { +__hostdev__ inline const nanovdb::Vec3d & +VoxelCoordTransform::Transform::translate() const { return mTranslated; } -/// @brief Get a primal voxel transform given a voxel size and the coordinate of the [0, 0, 0] voxel center +/// @brief Get a primal voxel transform given a voxel size and the coordinate of the [0, 0, 0] voxel +/// center /// @param voxSize The size of each voxel in the grid /// @param voxOrigin The coordinate of the [0, 0, 0] voxel center /// @return The primal voxel transform -inline __hostdev__ VoxelCoordTransform primalVoxelTransformForSizeAndOrigin(const nanovdb::Vec3d& voxSize, const nanovdb::Vec3d& voxOrigin) { - // TORCH_CHECK_VALUE(voxSize[0] > 0.0 && voxSize[1] > 0.0 && voxSize[2] > 0.0, "voxel_size must be positive"); - const nanovdb::Vec3d& w = voxSize; - const nanovdb::Vec3d& tx = voxOrigin; - const nanovdb::Vec3d invW = nanovdb::Vec3d(1.0, 1.0, 1.0) / w; - const nanovdb::Vec3d half(0.5, 0.5, 0.5); +inline __hostdev__ VoxelCoordTransform +primalVoxelTransformForSizeAndOrigin(const nanovdb::Vec3d &voxSize, + const nanovdb::Vec3d &voxOrigin) { + // TORCH_CHECK_VALUE(voxSize[0] > 0.0 && voxSize[1] > 0.0 && voxSize[2] > 0.0, "voxel_size must + // be positive"); + const nanovdb::Vec3d &w = voxSize; + const nanovdb::Vec3d &tx = voxOrigin; + const nanovdb::Vec3d invW = nanovdb::Vec3d(1.0, 1.0, 1.0) / w; + const nanovdb::Vec3d half(0.5, 0.5, 0.5); return VoxelCoordTransform(invW, -tx / w); } -/// @brief Get the primal and dual transforms for a grid given a voxel size and the coordinate of the [0, 0, 0] voxel center +/// @brief Get the primal and dual transforms for a grid given a voxel size and the coordinate of +/// the [0, 0, 0] voxel center /// @param voxSize The size of each voxel in the grid /// @param voxOrigin The coordinate of the [0, 0, 0] voxel center /// @param outPrimal Output primal transform /// @param outDual Output dual transform -inline __hostdev__ void voxelTransformForSizeAndOrigin(const nanovdb::Vec3d& voxSize, const nanovdb::Vec3d& voxOrigin, - VoxelCoordTransform& outPrimal, VoxelCoordTransform& outDual) { - // TORCH_CHECK_VALUE(voxSize[0] > 0.0 && voxSize[1] > 0.0 && voxSize[2] > 0.0, "voxel_size must be positive"); - const nanovdb::Vec3d& w = voxSize; - const nanovdb::Vec3d& tx = voxOrigin; - const nanovdb::Vec3d invW = nanovdb::Vec3d(1.0, 1.0, 1.0) / w; - const nanovdb::Vec3d half(0.5, 0.5, 0.5); +inline __hostdev__ void +voxelTransformForSizeAndOrigin(const nanovdb::Vec3d &voxSize, const nanovdb::Vec3d &voxOrigin, + VoxelCoordTransform &outPrimal, VoxelCoordTransform &outDual) { + // TORCH_CHECK_VALUE(voxSize[0] > 0.0 && voxSize[1] > 0.0 && voxSize[2] > 0.0, "voxel_size must + // be positive"); + const nanovdb::Vec3d &w = voxSize; + const nanovdb::Vec3d &tx = voxOrigin; + const nanovdb::Vec3d invW = nanovdb::Vec3d(1.0, 1.0, 1.0) / w; + const nanovdb::Vec3d half(0.5, 0.5, 0.5); outPrimal = VoxelCoordTransform(invW, -tx / w); - outDual = VoxelCoordTransform(invW, -tx / w + half); + outDual = VoxelCoordTransform(invW, -tx / w + half); } } // 
namespace detail } // namespace fvdb + +#endif // FVDB_DETAIL_VOXELCOORDTRANSFORM_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/Attention.cpp b/fvdb/src/detail/autograd/Attention.cpp index 6f3f99ee77..a5dc8813d3 100644 --- a/fvdb/src/detail/autograd/Attention.cpp +++ b/fvdb/src/detail/autograd/Attention.cpp @@ -3,23 +3,21 @@ // #include "Attention.h" -#include "detail/ops/Ops.h" -#include "detail/utils/Utils.h" +#include +#include namespace fvdb { namespace detail { namespace autograd { -Attention::variable_list Attention::forward(Attention::AutogradContext *ctx, - const Attention::Variable& query, - const Attention::Variable& key, - const Attention::Variable& value, - const Attention::Variable& qLengths, - const Attention::Variable& kvLengths, - float scale) { +Attention::variable_list +Attention::forward(Attention::AutogradContext *ctx, const Attention::Variable &query, + const Attention::Variable &key, const Attention::Variable &value, + const Attention::Variable &qLengths, const Attention::Variable &kvLengths, + float scale) { torch::Tensor out = FVDB_DISPATCH_KERNEL_DEVICE(query.device(), [&]() { - return ops::dispatchScaledDotProductAttention( - query, key, value, qLengths, kvLengths, true, scale); + return ops::dispatchScaledDotProductAttention(query, key, value, qLengths, + kvLengths, true, scale); }); // ctx->saved_data["tsmtThreshold"] = tsmtThreshold; @@ -29,15 +27,14 @@ Attention::variable_list Attention::forward(Attention::AutogradContext *ctx, // outOpacity, outDepth, outRgb, outWs // }); - return { out}; + return { out }; } -Attention::variable_list Attention::backward(Attention::AutogradContext *ctx, - Attention::variable_list grad_output) { +Attention::variable_list +Attention::backward(Attention::AutogradContext *ctx, Attention::variable_list grad_output) { TORCH_CHECK(false, "Not implemented"); } - } // namespace autograd } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/autograd/Attention.h b/fvdb/src/detail/autograd/Attention.h index 050ed643e3..9ee04a49ca 100644 --- a/fvdb/src/detail/autograd/Attention.h +++ b/fvdb/src/detail/autograd/Attention.h @@ -1,33 +1,29 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_ATTENTION_H +#define FVDB_DETAIL_AUTOGRAD_ATTENTION_H #include - namespace fvdb { namespace detail { namespace autograd { -struct Attention : public torch::autograd::Function -{ - using variable_list = torch::autograd::variable_list; +struct Attention : public torch::autograd::Function { + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - const Variable& query, - const Variable& key, - const Variable& value, - const Variable& qLengths, - const Variable& kvLengths, - float scale); + static variable_list forward(AutogradContext *ctx, const Variable &query, const Variable &key, + const Variable &value, const Variable &qLengths, + const Variable &kvLengths, float scale); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_ATTENTION_H \ No newline at end of file diff --git 
a/fvdb/src/detail/autograd/Autograd.h b/fvdb/src/detail/autograd/Autograd.h index 10873c9255..c63be9fac8 100644 --- a/fvdb/src/detail/autograd/Autograd.h +++ b/fvdb/src/detail/autograd/Autograd.h @@ -1,18 +1,23 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#include "MaxPoolGrid.h" +#ifndef FVDB_DETAIL_AUTOGRAD_AUTOGRAD_H +#define FVDB_DETAIL_AUTOGRAD_AUTOGRAD_H + +#include "Attention.h" #include "AvgPoolGrid.h" +#include "FillToGrid.h" +#include "JaggedReduce.h" +#include "MaxPoolGrid.h" +#include "ReadFromDense.h" +#include "ReadIntoDense.h" #include "SampleGrid.h" +#include "SparseConvolutionHalo.h" +#include "SparseConvolutionImplicitGEMM.h" +#include "SparseConvolutionKernelMap.h" #include "SplatIntoGrid.h" -#include "UpsampleGrid.h" #include "TransformPoints.h" +#include "UpsampleGrid.h" #include "VolumeRender.h" -#include "SparseConvolutionKernelMap.h" -#include "SparseConvolutionHalo.h" -#include "SparseConvolutionImplicitGEMM.h" -#include "ReadIntoDense.h" -#include "ReadFromDense.h" -#include "FillToGrid.h" -#include "JaggedReduce.h" -#include "Attention.h" \ No newline at end of file + +#endif // FVDB_DETAIL_AUTOGRAD_AUTOGRAD_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/AvgPoolGrid.cpp b/fvdb/src/detail/autograd/AvgPoolGrid.cpp index 0078deb660..cf76ad295e 100644 --- a/fvdb/src/detail/autograd/AvgPoolGrid.cpp +++ b/fvdb/src/detail/autograd/AvgPoolGrid.cpp @@ -3,70 +3,60 @@ // #include "AvgPoolGrid.h" -#include - -#include "detail/ops/Ops.h" -#include "detail/utils/Utils.h" +#include +#include +#include namespace fvdb { namespace detail { namespace autograd { -AvgPoolGrid::variable_list AvgPoolGrid::forward(AvgPoolGrid::AutogradContext *ctx, - c10::intrusive_ptr fineGrid, - c10::intrusive_ptr coarseGrid, - nanovdb::Coord poolingFactor, - nanovdb::Coord stride, - AvgPoolGrid::Variable fineData) { - +AvgPoolGrid::variable_list +AvgPoolGrid::forward(AvgPoolGrid::AutogradContext *ctx, c10::intrusive_ptr fineGrid, + c10::intrusive_ptr coarseGrid, nanovdb::Coord poolingFactor, + nanovdb::Coord stride, AvgPoolGrid::Variable fineData) { torch::Tensor outCoarseData = FVDB_DISPATCH_KERNEL_DEVICE(fineData.device(), [&]() { - return ops::dispatchDownsampleGridAvgPool( - *fineGrid, *coarseGrid, fineData, poolingFactor, stride); + return ops::dispatchDownsampleGridAvgPool(*fineGrid, *coarseGrid, fineData, + poolingFactor, stride); }); - ctx->save_for_backward({fineData}); - ctx->saved_data["fine_grid"] = fineGrid; - ctx->saved_data["coarse_grid"] = coarseGrid; - ctx->saved_data["pooling_factor_x"] = (int64_t) poolingFactor[0]; - ctx->saved_data["pooling_factor_y"] = (int64_t) poolingFactor[1]; - ctx->saved_data["pooling_factor_z"] = (int64_t) poolingFactor[2]; - ctx->saved_data["stride_x"] = (int64_t) stride[0]; - ctx->saved_data["stride_y"] = (int64_t) stride[1]; - ctx->saved_data["stride_z"] = (int64_t) stride[2]; + ctx->save_for_backward({ fineData }); + ctx->saved_data["fine_grid"] = fineGrid; + ctx->saved_data["coarse_grid"] = coarseGrid; + ctx->saved_data["pooling_factor_x"] = (int64_t)poolingFactor[0]; + ctx->saved_data["pooling_factor_y"] = (int64_t)poolingFactor[1]; + ctx->saved_data["pooling_factor_z"] = (int64_t)poolingFactor[2]; + ctx->saved_data["stride_x"] = (int64_t)stride[0]; + ctx->saved_data["stride_y"] = (int64_t)stride[1]; + ctx->saved_data["stride_z"] = (int64_t)stride[2]; - return variable_list({outCoarseData}); + return variable_list({ outCoarseData }); } -AvgPoolGrid::variable_list 
AvgPoolGrid::backward(AvgPoolGrid::AutogradContext *ctx, - AvgPoolGrid::variable_list grad_output) { - +AvgPoolGrid::variable_list +AvgPoolGrid::backward(AvgPoolGrid::AutogradContext *ctx, AvgPoolGrid::variable_list grad_output) { // Use data saved in forward - variable_list saved = ctx->get_saved_variables(); - Variable fineData = saved.at(0); - auto fineGrid = ctx->saved_data["fine_grid"].toCustomClass(); - auto coarseGrid = ctx->saved_data["coarse_grid"].toCustomClass(); - const int64_t poolingFactorX = ctx->saved_data["pooling_factor_x"].toInt(); - const int64_t poolingFactorY = ctx->saved_data["pooling_factor_y"].toInt(); - const int64_t poolingFactorZ = ctx->saved_data["pooling_factor_z"].toInt(); - const int64_t strideX = ctx->saved_data["stride_x"].toInt(); - const int64_t strideY = ctx->saved_data["stride_y"].toInt(); - const int64_t strideZ = ctx->saved_data["stride_z"].toInt(); + variable_list saved = ctx->get_saved_variables(); + Variable fineData = saved.at(0); + auto fineGrid = ctx->saved_data["fine_grid"].toCustomClass(); + auto coarseGrid = ctx->saved_data["coarse_grid"].toCustomClass(); + const int64_t poolingFactorX = ctx->saved_data["pooling_factor_x"].toInt(); + const int64_t poolingFactorY = ctx->saved_data["pooling_factor_y"].toInt(); + const int64_t poolingFactorZ = ctx->saved_data["pooling_factor_z"].toInt(); + const int64_t strideX = ctx->saved_data["stride_x"].toInt(); + const int64_t strideY = ctx->saved_data["stride_y"].toInt(); + const int64_t strideZ = ctx->saved_data["stride_z"].toInt(); const nanovdb::Coord poolingFactor(poolingFactorX, poolingFactorY, poolingFactorZ); const nanovdb::Coord stride(strideX, strideY, strideZ); - Variable gradOut = grad_output.at(0).contiguous(); // [#coarse_voxels | #coarse_corners, *] + Variable gradOut = grad_output.at(0).contiguous(); // [#coarse_voxels | #coarse_corners, *] Variable outGradIn = FVDB_DISPATCH_KERNEL_DEVICE(gradOut.device(), [&]() { return ops::dispatchDownsampleGridAvgPoolBackward( - *coarseGrid, *fineGrid, - fineData, - gradOut, - poolingFactor, - stride - ); + *coarseGrid, *fineGrid, fineData, gradOut, poolingFactor, stride); }); - return {torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), outGradIn}; + return { torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), outGradIn }; } } // namespace autograd diff --git a/fvdb/src/detail/autograd/AvgPoolGrid.h b/fvdb/src/detail/autograd/AvgPoolGrid.h index ec6211d7be..b8d6247baa 100644 --- a/fvdb/src/detail/autograd/AvgPoolGrid.h +++ b/fvdb/src/detail/autograd/AvgPoolGrid.h @@ -1,33 +1,32 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once - -#include +#ifndef FVDB_DETAIL_AUTOGRAD_AVGPOOLGRID_H +#define FVDB_DETAIL_AUTOGRAD_AVGPOOLGRID_H #include "detail/GridBatchImpl.h" +#include namespace fvdb { namespace detail { namespace autograd { struct AvgPoolGrid : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr fineGrid, + static variable_list forward(AutogradContext *ctx, c10::intrusive_ptr fineGrid, c10::intrusive_ptr coarseGrid, - nanovdb::Coord poolingFactor, - nanovdb::Coord stride, + nanovdb::Coord poolingFactor, nanovdb::Coord stride, Variable fineData); - static 
variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_AVGPOOLGRID_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/FillToGrid.h b/fvdb/src/detail/autograd/FillToGrid.h index c1a43aa652..7af2a74730 100644 --- a/fvdb/src/detail/autograd/FillToGrid.h +++ b/fvdb/src/detail/autograd/FillToGrid.h @@ -1,78 +1,81 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_FILLTOGRID_H +#define FVDB_DETAIL_AUTOGRAD_FILLTOGRID_H -#include +#include +#include +#include +#include #include #include -#include "detail/ops/Ops.h" -#include "detail/utils/Utils.h" - -#include "detail/GridBatchImpl.h" -#include "Types.h" - +#include namespace fvdb { namespace detail { namespace autograd { struct FillToGrid : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr fromGrid, - c10::intrusive_ptr toGrid, - Variable fromFeatures, - const int default_value=0.0) { - TORCH_CHECK_VALUE(fromFeatures.size(0) == fromGrid->totalVoxels(), "fromFeatures must conform to fromGrid"); - TORCH_CHECK_VALUE(fromGrid->batchSize() == toGrid->batchSize(), "fromGrid and toGrid must have the same batch size"); + static variable_list + forward(AutogradContext *ctx, c10::intrusive_ptr fromGrid, + c10::intrusive_ptr toGrid, Variable fromFeatures, + const int default_value = 0.0) { + TORCH_CHECK_VALUE(fromFeatures.size(0) == fromGrid->totalVoxels(), + "fromFeatures must conform to fromGrid"); + TORCH_CHECK_VALUE(fromGrid->batchSize() == toGrid->batchSize(), + "fromGrid and toGrid must have the same batch size"); torch::Tensor fromFeaturesReshape = featureCoalescedView(fromFeatures); - torch::Tensor ret = torch::full({toGrid->totalVoxels(), fromFeaturesReshape.size(1)}, + torch::Tensor ret = torch::full({ toGrid->totalVoxels(), fromFeaturesReshape.size(1) }, default_value, fromFeaturesReshape.options()); - auto outShape = spliceShape({toGrid->totalVoxels()}, fromFeatures, 1); // [B*M, *] + auto outShape = spliceShape({ toGrid->totalVoxels() }, fromFeatures, 1); // [B*M, *] // Dispatch to kernel. 
FVDB_DISPATCH_KERNEL_DEVICE(fromGrid->device(), [&]() { - ops::dispatchFillToGrid( - *fromGrid, *toGrid, fromFeaturesReshape, ret); + ops::dispatchFillToGrid(*fromGrid, *toGrid, fromFeaturesReshape, ret); }); ctx->saved_data["from_grid"] = fromGrid; - ctx->saved_data["to_grid"] = toGrid; + ctx->saved_data["to_grid"] = toGrid; - return variable_list({ret.reshape(outShape)}); + return variable_list({ ret.reshape(outShape) }); } - static variable_list backward(AutogradContext *ctx, - variable_list grad_output) { - torch::Tensor gradFeatures = grad_output[0]; + static variable_list + backward(AutogradContext *ctx, variable_list grad_output) { + torch::Tensor gradFeatures = grad_output[0]; torch::Tensor gradFeaturesReshape = featureCoalescedView(gradFeatures); auto fromGrid = ctx->saved_data["from_grid"].toCustomClass(); - auto toGrid = ctx->saved_data["to_grid"].toCustomClass(); - auto outShape = spliceShape({fromGrid->totalVoxels()}, gradFeatures, 1); // [B*M, *] + auto toGrid = ctx->saved_data["to_grid"].toCustomClass(); + auto outShape = spliceShape({ fromGrid->totalVoxels() }, gradFeatures, 1); // [B*M, *] - // The default grad_input is always 0.0, since gradient will only propagate for overlapped voxels. - torch::Tensor gradInput = torch::zeros({fromGrid->totalVoxels(), gradFeaturesReshape.size(1)}, - gradFeaturesReshape.options()); + // The default grad_input is always 0.0, since gradient will only propagate for overlapped + // voxels. + torch::Tensor gradInput = + torch::zeros({ fromGrid->totalVoxels(), gradFeaturesReshape.size(1) }, + gradFeaturesReshape.options()); // Dispatch same kernel but with to and from switched. FVDB_DISPATCH_KERNEL_DEVICE(fromGrid->device(), [&]() { - ops::dispatchFillToGrid( - *toGrid, *fromGrid, gradFeaturesReshape, gradInput); + ops::dispatchFillToGrid(*toGrid, *fromGrid, gradFeaturesReshape, gradInput); }); - return variable_list({torch::Tensor(), torch::Tensor(), gradInput.reshape(outShape), torch::Tensor()}); + return variable_list( + { torch::Tensor(), torch::Tensor(), gradInput.reshape(outShape), torch::Tensor() }); } }; } // namespace autograd } // namespace detail } // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_FILLTOGRID_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/JaggedReduce.cpp b/fvdb/src/detail/autograd/JaggedReduce.cpp index f2c0e99451..37f9f09b8c 100644 --- a/fvdb/src/detail/autograd/JaggedReduce.cpp +++ b/fvdb/src/detail/autograd/JaggedReduce.cpp @@ -3,17 +3,17 @@ // #include "JaggedReduce.h" -#include - -#include "detail/ops/jagged/JaggedOps.h" -#include "detail/utils/Utils.h" +#include +#include +#include namespace fvdb { namespace detail { namespace autograd { -static inline std::vector list2vec(const c10::List list) { +static inline std::vector +list2vec(const c10::List list) { std::vector result; result.reserve(list.size()); for (size_t i = 0; i < list.size(); i++) @@ -21,109 +21,107 @@ static inline std::vector list2vec(const c10::List list) { return result; } -JaggedSum::variable_list JaggedSum::forward(JaggedSum::AutogradContext *ctx, - JaggedSum::Variable jdata, - JaggedSum::Variable jidx, - JaggedSum::Variable joffsets, - int64_t dim_size) { +JaggedSum::variable_list +JaggedSum::forward(JaggedSum::AutogradContext *ctx, JaggedSum::Variable jdata, + JaggedSum::Variable jidx, JaggedSum::Variable joffsets, int64_t dim_size) { TORCH_CHECK_VALUE(jdata.device() == jidx.device(), "jdata and jidx must be on the same device"); - TORCH_CHECK_VALUE(jdata.device() == joffsets.device(), "jdata and joffsets must be on 
the same device"); + TORCH_CHECK_VALUE(jdata.device() == joffsets.device(), + "jdata and joffsets must be on the same device"); torch::Tensor outData = FVDB_DISPATCH_KERNEL_DEVICE(jdata.device(), [&]() { return ops::dispatchJaggedSum(jdata, jidx, joffsets, dim_size); }); - ctx->save_for_backward({jidx}); - return variable_list({outData}); + ctx->save_for_backward({ jidx }); + return variable_list({ outData }); } -JaggedSum::variable_list JaggedSum::backward(JaggedSum::AutogradContext *ctx, - JaggedSum::variable_list grad_output) { - variable_list saved = ctx->get_saved_variables(); - Variable jidx = saved.at(0); - Variable gradIn = grad_output.at(0).index({jidx.to(torch::kInt32)}); - return {gradIn, torch::Tensor(), torch::Tensor(), torch::Tensor()}; +JaggedSum::variable_list +JaggedSum::backward(JaggedSum::AutogradContext *ctx, JaggedSum::variable_list grad_output) { + variable_list saved = ctx->get_saved_variables(); + Variable jidx = saved.at(0); + Variable gradIn = grad_output.at(0).index({ jidx.to(torch::kInt32) }); + return { gradIn, torch::Tensor(), torch::Tensor(), torch::Tensor() }; } -JaggedMin::variable_list JaggedMin::forward(JaggedMin::AutogradContext *ctx, - JaggedMin::Variable jdata, - JaggedMin::Variable jidx, - JaggedMin::Variable joffsets, - int64_t dim_size) { +JaggedMin::variable_list +JaggedMin::forward(JaggedMin::AutogradContext *ctx, JaggedMin::Variable jdata, + JaggedMin::Variable jidx, JaggedMin::Variable joffsets, int64_t dim_size) { TORCH_CHECK_VALUE(jdata.device() == jidx.device(), "jdata and jidx must be on the same device"); - TORCH_CHECK_VALUE(jdata.device() == joffsets.device(), "jdata and joffsets must be on the same device"); + TORCH_CHECK_VALUE(jdata.device() == joffsets.device(), + "jdata and joffsets must be on the same device"); - auto minOut = FVDB_DISPATCH_KERNEL_DEVICE(jdata.device(), [&]() { + auto minOut = FVDB_DISPATCH_KERNEL_DEVICE(jdata.device(), [&]() { return ops::dispatchJaggedMin(jdata, jidx, joffsets, dim_size); }); torch::Tensor minData = minOut[0]; - torch::Tensor minIdx = minOut[1]; - ctx->save_for_backward({minIdx, joffsets}); + torch::Tensor minIdx = minOut[1]; + ctx->save_for_backward({ minIdx, joffsets }); ctx->saved_data["src_shape"] = jdata.sizes(); - return variable_list({minData, minIdx}); + return variable_list({ minData, minIdx }); } -JaggedMin::variable_list JaggedMin::backward(JaggedMin::AutogradContext *ctx, - JaggedMin::variable_list grad_output) { - variable_list saved = ctx->get_saved_variables(); - Variable gradOut = grad_output.at(0); - Variable minIdx = saved.at(0); - Variable joffsets0 = saved.at(1).index({torch::indexing::Slice(0, -1)}); +JaggedMin::variable_list +JaggedMin::backward(JaggedMin::AutogradContext *ctx, JaggedMin::variable_list grad_output) { + variable_list saved = ctx->get_saved_variables(); + Variable gradOut = grad_output.at(0); + Variable minIdx = saved.at(0); + Variable joffsets0 = saved.at(1).index({ torch::indexing::Slice(0, -1) }); for (int i = 0; i < minIdx.dim() - 1; i += 1) { joffsets0 = joffsets0.unsqueeze(-1); } auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList()); - // For output that receives no input, propagate to position -1 will result in memory out-of-bound error. + // For output that receives no input, propagate to position -1 will result in memory + // out-of-bound error. // Therefore, we need to add a dummy zero at the beginning of the index tensor. 
// src_shape[0] += 1; Variable gradIn = torch::zeros(src_shape, gradOut.options()); gradIn.scatter_(0, minIdx + joffsets0, gradOut); // gradIn = gradIn.narrow(0, 1, src_shape[0] - 1); - return {gradIn, torch::Tensor(), torch::Tensor(), torch::Tensor()}; + return { gradIn, torch::Tensor(), torch::Tensor(), torch::Tensor() }; } -JaggedMax::variable_list JaggedMax::forward(JaggedMax::AutogradContext *ctx, - JaggedMax::Variable jdata, - JaggedMax::Variable jidx, - JaggedMax::Variable joffsets, - int64_t dim_size) { +JaggedMax::variable_list +JaggedMax::forward(JaggedMax::AutogradContext *ctx, JaggedMax::Variable jdata, + JaggedMax::Variable jidx, JaggedMax::Variable joffsets, int64_t dim_size) { TORCH_CHECK_VALUE(jdata.device() == jidx.device(), "jdata and jidx must be on the same device"); - TORCH_CHECK_VALUE(jdata.device() == joffsets.device(), "jdata and joffsets must be on the same device"); + TORCH_CHECK_VALUE(jdata.device() == joffsets.device(), + "jdata and joffsets must be on the same device"); - auto maxOut = FVDB_DISPATCH_KERNEL_DEVICE(jdata.device(), [&]() { + auto maxOut = FVDB_DISPATCH_KERNEL_DEVICE(jdata.device(), [&]() { return ops::dispatchJaggedMax(jdata, jidx, joffsets, dim_size); }); torch::Tensor maxData = maxOut[0]; - torch::Tensor maxIdx = maxOut[1]; + torch::Tensor maxIdx = maxOut[1]; - ctx->save_for_backward({maxIdx, joffsets}); + ctx->save_for_backward({ maxIdx, joffsets }); ctx->saved_data["src_shape"] = jdata.sizes(); - return variable_list({maxData, maxIdx}); + return variable_list({ maxData, maxIdx }); } -JaggedMax::variable_list JaggedMax::backward(JaggedMax::AutogradContext *ctx, - JaggedMax::variable_list grad_output) { - variable_list saved = ctx->get_saved_variables(); - Variable gradOut = grad_output.at(0); - Variable maxIdx = saved.at(0); - Variable joffsets0 = saved.at(1).index({torch::indexing::Slice(0, -1)}); +JaggedMax::variable_list +JaggedMax::backward(JaggedMax::AutogradContext *ctx, JaggedMax::variable_list grad_output) { + variable_list saved = ctx->get_saved_variables(); + Variable gradOut = grad_output.at(0); + Variable maxIdx = saved.at(0); + Variable joffsets0 = saved.at(1).index({ torch::indexing::Slice(0, -1) }); for (int i = 0; i < maxIdx.dim() - 1; i += 1) { joffsets0 = joffsets0.unsqueeze(-1); } auto src_shape = list2vec(ctx->saved_data["src_shape"].toIntList()); - // For output that receives no input, propagate to position -1 will result in memory out-of-bound error. + // For output that receives no input, propagate to position -1 will result in memory + // out-of-bound error. // Therefore, we need to add a dummy zero at the beginning of the index tensor. 
// src_shape[0] += 1; Variable gradIn = torch::zeros(src_shape, gradOut.options()); gradIn.scatter_(0, maxIdx + joffsets0, gradOut); // gradIn = gradIn.narrow(0, 1, src_shape[0] - 1); - return {gradIn, torch::Tensor(), torch::Tensor(), torch::Tensor()}; + return { gradIn, torch::Tensor(), torch::Tensor(), torch::Tensor() }; } - } // namespace autograd } // namespace detail } // namespace fvdb \ No newline at end of file diff --git a/fvdb/src/detail/autograd/JaggedReduce.h b/fvdb/src/detail/autograd/JaggedReduce.h index 151a2cda81..e9e3c24107 100644 --- a/fvdb/src/detail/autograd/JaggedReduce.h +++ b/fvdb/src/detail/autograd/JaggedReduce.h @@ -1,56 +1,52 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once - -#include +#ifndef FVDB_DETAIL_AUTOGRAD_JAGGEDREDUCE_H +#define FVDB_DETAIL_AUTOGRAD_JAGGEDREDUCE_H #include "detail/GridBatchImpl.h" +#include namespace fvdb { namespace detail { namespace autograd { struct JaggedSum : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - Variable jdata, Variable jidx, + static variable_list forward(AutogradContext *ctx, Variable jdata, Variable jidx, Variable joffsets, int64_t dim_size); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; struct JaggedMin : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - Variable jdata, Variable jidx, + static variable_list forward(AutogradContext *ctx, Variable jdata, Variable jidx, Variable joffsets, int64_t dim_size); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; struct JaggedMax : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - Variable jdata, Variable jidx, + static variable_list forward(AutogradContext *ctx, Variable jdata, Variable jidx, Variable joffsets, int64_t dim_size); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_JAGGEDREDUCE_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/MaxPoolGrid.cpp b/fvdb/src/detail/autograd/MaxPoolGrid.cpp index f2f9c51a5f..38ef2c8218 100644 --- a/fvdb/src/detail/autograd/MaxPoolGrid.cpp +++ b/fvdb/src/detail/autograd/MaxPoolGrid.cpp @@ -3,71 +3,61 @@ // #include "MaxPoolGrid.h" -#include - -#include 
"detail/ops/Ops.h" -#include "detail/utils/Utils.h" +#include +#include +#include namespace fvdb { namespace detail { namespace autograd { -MaxPoolGrid::variable_list MaxPoolGrid::forward(MaxPoolGrid::AutogradContext *ctx, - c10::intrusive_ptr fineGrid, - c10::intrusive_ptr coarseGrid, - nanovdb::Coord poolingFactor, - nanovdb::Coord stride, - MaxPoolGrid::Variable fineData) { - +MaxPoolGrid::variable_list +MaxPoolGrid::forward(MaxPoolGrid::AutogradContext *ctx, c10::intrusive_ptr fineGrid, + c10::intrusive_ptr coarseGrid, nanovdb::Coord poolingFactor, + nanovdb::Coord stride, MaxPoolGrid::Variable fineData) { torch::Tensor outCoarseData = FVDB_DISPATCH_KERNEL_DEVICE(fineData.device(), [&]() { - return ops::dispatchDownsampleGridMaxPool( - *fineGrid, *coarseGrid, fineData, poolingFactor, stride); + return ops::dispatchDownsampleGridMaxPool(*fineGrid, *coarseGrid, fineData, + poolingFactor, stride); }); - ctx->save_for_backward({fineData}); - ctx->saved_data["fine_grid"] = fineGrid; - ctx->saved_data["coarse_grid"] = coarseGrid; - ctx->saved_data["pooling_factor_x"] = (int64_t) poolingFactor[0]; - ctx->saved_data["pooling_factor_y"] = (int64_t) poolingFactor[1]; - ctx->saved_data["pooling_factor_z"] = (int64_t) poolingFactor[2]; - ctx->saved_data["stride_x"] = (int64_t) stride[0]; - ctx->saved_data["stride_y"] = (int64_t) stride[1]; - ctx->saved_data["stride_z"] = (int64_t) stride[2]; + ctx->save_for_backward({ fineData }); + ctx->saved_data["fine_grid"] = fineGrid; + ctx->saved_data["coarse_grid"] = coarseGrid; + ctx->saved_data["pooling_factor_x"] = (int64_t)poolingFactor[0]; + ctx->saved_data["pooling_factor_y"] = (int64_t)poolingFactor[1]; + ctx->saved_data["pooling_factor_z"] = (int64_t)poolingFactor[2]; + ctx->saved_data["stride_x"] = (int64_t)stride[0]; + ctx->saved_data["stride_y"] = (int64_t)stride[1]; + ctx->saved_data["stride_z"] = (int64_t)stride[2]; - return variable_list({outCoarseData}); + return variable_list({ outCoarseData }); } -MaxPoolGrid::variable_list MaxPoolGrid::backward(MaxPoolGrid::AutogradContext *ctx, - MaxPoolGrid::variable_list grad_output) { - +MaxPoolGrid::variable_list +MaxPoolGrid::backward(MaxPoolGrid::AutogradContext *ctx, MaxPoolGrid::variable_list grad_output) { // Use data saved in forward - variable_list saved = ctx->get_saved_variables(); - Variable fineData = saved.at(0); - auto fineGrid = ctx->saved_data["fine_grid"].toCustomClass(); - auto coarseGrid = ctx->saved_data["coarse_grid"].toCustomClass(); - const int64_t poolingFactorX = ctx->saved_data["pooling_factor_x"].toInt(); - const int64_t poolingFactorY = ctx->saved_data["pooling_factor_y"].toInt(); - const int64_t poolingFactorZ = ctx->saved_data["pooling_factor_z"].toInt(); - const int64_t strideX = ctx->saved_data["stride_x"].toInt(); - const int64_t strideY = ctx->saved_data["stride_y"].toInt(); - const int64_t strideZ = ctx->saved_data["stride_z"].toInt(); + variable_list saved = ctx->get_saved_variables(); + Variable fineData = saved.at(0); + auto fineGrid = ctx->saved_data["fine_grid"].toCustomClass(); + auto coarseGrid = ctx->saved_data["coarse_grid"].toCustomClass(); + const int64_t poolingFactorX = ctx->saved_data["pooling_factor_x"].toInt(); + const int64_t poolingFactorY = ctx->saved_data["pooling_factor_y"].toInt(); + const int64_t poolingFactorZ = ctx->saved_data["pooling_factor_z"].toInt(); + const int64_t strideX = ctx->saved_data["stride_x"].toInt(); + const int64_t strideY = ctx->saved_data["stride_y"].toInt(); + const int64_t strideZ = ctx->saved_data["stride_z"].toInt(); 
const nanovdb::Coord poolingFactor(poolingFactorX, poolingFactorY, poolingFactorZ); const nanovdb::Coord stride(strideX, strideY, strideZ); - Variable gradOut = grad_output.at(0).contiguous(); // [#coarse_voxels | #coarse_corners, *] + Variable gradOut = grad_output.at(0).contiguous(); // [#coarse_voxels | #coarse_corners, *] Variable outGradIn = FVDB_DISPATCH_KERNEL_DEVICE(gradOut.device(), [&]() { return ops::dispatchDownsampleGridMaxPoolBackward( - *coarseGrid, *fineGrid, - fineData, - gradOut, - poolingFactor, - stride - ); + *coarseGrid, *fineGrid, fineData, gradOut, poolingFactor, stride); }); - return {torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), outGradIn}; + return { torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor(), outGradIn }; } } // namespace autograd diff --git a/fvdb/src/detail/autograd/MaxPoolGrid.h b/fvdb/src/detail/autograd/MaxPoolGrid.h index 51bd617a90..4f02314c89 100644 --- a/fvdb/src/detail/autograd/MaxPoolGrid.h +++ b/fvdb/src/detail/autograd/MaxPoolGrid.h @@ -1,33 +1,32 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_MAXPOOLGRID_H +#define FVDB_DETAIL_AUTOGRAD_MAXPOOLGRID_H -#include - -#include "detail/GridBatchImpl.h" +#include +#include namespace fvdb { namespace detail { namespace autograd { struct MaxPoolGrid : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr fineGrid, + static variable_list forward(AutogradContext *ctx, c10::intrusive_ptr fineGrid, c10::intrusive_ptr coarseGrid, - nanovdb::Coord poolingFactor, - nanovdb::Coord stride, + nanovdb::Coord poolingFactor, nanovdb::Coord stride, Variable fineData); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_MAXPOOLGRID_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/ReadFromDense.h b/fvdb/src/detail/autograd/ReadFromDense.h index a4a8f81217..101fcceb6a 100644 --- a/fvdb/src/detail/autograd/ReadFromDense.h +++ b/fvdb/src/detail/autograd/ReadFromDense.h @@ -1,36 +1,35 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_READFROMDENSE_H +#define FVDB_DETAIL_AUTOGRAD_READFROMDENSE_H -#include +#include +#include +#include +#include #include #include -#include "detail/ops/Ops.h" -#include "detail/utils/Utils.h" - -#include "detail/GridBatchImpl.h" -#include "Types.h" - +#include namespace fvdb { namespace detail { namespace autograd { struct ReadFromDense : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr grid, - Variable denseData, - const Vec3iBatch& denseOrigins) { + static variable_list + 
forward(AutogradContext *ctx, c10::intrusive_ptr grid, Variable denseData, + const Vec3iBatch &denseOrigins) { TORCH_CHECK_VALUE(denseData.dim() > 4, "dense data must have shape [B, W, H, D, *]"); - TORCH_CHECK_VALUE(denseData.size(0) == grid->batchSize(), "dense data must have shape [B, W, H, D, *]"); + TORCH_CHECK_VALUE(denseData.size(0) == grid->batchSize(), + "dense data must have shape [B, W, H, D, *]"); TORCH_CHECK_VALUE(denseData.is_contiguous(), "sparse_data must be contiguous"); grid->checkDevice(denseData); @@ -41,49 +40,55 @@ struct ReadFromDense : public torch::autograd::Function { torch::Tensor denseDataReshape = featureCoalescedView(denseData, 4); // [N, -1] - torch::Tensor ret = torch::zeros({grid->totalVoxels(), denseDataReshape.size(4)}, denseData.options()); + torch::Tensor ret = + torch::zeros({ grid->totalVoxels(), denseDataReshape.size(4) }, denseData.options()); // nanovdb::Coord denseOriginNvdb = tensorToCoord(denseOrigins); // NanoVDB coordinates are int32 - torch::Tensor denseOriginsI32 = denseOrigins.tensorValue(grid->batchSize(), false /*onlyPositive*/, "dense_origins").to(denseData.device()); + torch::Tensor denseOriginsI32 = + denseOrigins.tensorValue(grid->batchSize(), false /*onlyPositive*/, "dense_origins") + .to(denseData.device()); FVDB_DISPATCH_KERNEL_DEVICE(grid->device(), [&]() { - ops::dispatchReadFromDense( - *grid, denseDataReshape, denseOriginsI32, ret, false); + ops::dispatchReadFromDense(*grid, denseDataReshape, denseOriginsI32, ret, + false); }); // Reshape [B, N, -1] to [B, N, *] given [B, W, H, D, *] - torch::Tensor retReshape = ret.view( - spliceShape({grid->totalVoxels()}, denseData, 4)); + torch::Tensor retReshape = ret.view(spliceShape({ grid->totalVoxels() }, denseData, 4)); // Save shape information for backward ctx->saved_data["dense_origin"] = denseOriginsI32; - ctx->saved_data["grid_size"] = coordToTensor(nanovdb::Coord(denseData.size(1), denseData.size(2), denseData.size(3))); - ctx->saved_data["grid"] = grid; - ctx->saved_data["dummy_tensor"] = torch::empty({0}, denseData.options()); - torch::Tensor retShape = torch::empty({(int64_t) denseData.dim()}, torch::TensorOptions().dtype(torch::kLong)); + ctx->saved_data["grid_size"] = + coordToTensor(nanovdb::Coord(denseData.size(1), denseData.size(2), denseData.size(3))); + ctx->saved_data["grid"] = grid; + ctx->saved_data["dummy_tensor"] = torch::empty({ 0 }, denseData.options()); + torch::Tensor retShape = + torch::empty({ (int64_t)denseData.dim() }, torch::TensorOptions().dtype(torch::kLong)); auto acc = retShape.accessor(); for (int i = 0; i < denseData.dim(); i++) { acc[i] = denseData.size(i); } ctx->saved_data["final_shape"] = retShape; - return variable_list({retReshape}); // [N, *] + return variable_list({ retReshape }); // [N, *] } - static variable_list backward(AutogradContext *ctx, - variable_list grad_output) { - + static variable_list + backward(AutogradContext *ctx, variable_list grad_output) { // Use data saved in forward - torch::Tensor denseOrigins = ctx->saved_data["dense_origin"].toTensor(); // [B, 3] - nanovdb::Coord gridSize = tensorToCoord(ctx->saved_data["grid_size"].toTensor()); - auto grid = ctx->saved_data["grid"].toCustomClass(); + torch::Tensor denseOrigins = ctx->saved_data["dense_origin"].toTensor(); // [B, 3] + nanovdb::Coord gridSize = tensorToCoord(ctx->saved_data["grid_size"].toTensor()); + auto grid = ctx->saved_data["grid"].toCustomClass(); torch::TensorOptions denseDataOpts = ctx->saved_data["dummy_tensor"].toTensor().options(); - std::vector 
finalShapeTensor = intTensor1DToStdVector(ctx->saved_data["final_shape"].toTensor()); + std::vector finalShapeTensor = + intTensor1DToStdVector(ctx->saved_data["final_shape"].toTensor()); - Variable gradOut = grad_output.at(0); // [N, *] + Variable gradOut = grad_output.at(0); // [N, *] torch::Tensor gradOutReshape = featureCoalescedView(gradOut); // [N, -1] - torch::Tensor ret = torch::zeros({grid->batchSize(), gridSize[0], gridSize[1], gridSize[2], gradOutReshape.size(1)}, denseDataOpts); // [B, W, H, D, -1] + torch::Tensor ret = torch::zeros( + { grid->batchSize(), gridSize[0], gridSize[1], gridSize[2], gradOutReshape.size(1) }, + denseDataOpts); // [B, W, H, D, -1] FVDB_DISPATCH_KERNEL_DEVICE(grid->device(), [&]() { ops::dispatchReadIntoDense(*grid, gradOutReshape, denseOrigins, ret, false); @@ -91,10 +96,12 @@ struct ReadFromDense : public torch::autograd::Function { torch::Tensor retReshape = ret.view(finalShapeTensor); // [B, W, H, D, *] - return {torch::Tensor(), retReshape, torch::Tensor()}; + return { torch::Tensor(), retReshape, torch::Tensor() }; } }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_READFROMDENSE_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/ReadIntoDense.cpp b/fvdb/src/detail/autograd/ReadIntoDense.cpp index 5d05aef23d..9572107ce4 100644 --- a/fvdb/src/detail/autograd/ReadIntoDense.cpp +++ b/fvdb/src/detail/autograd/ReadIntoDense.cpp @@ -3,97 +3,113 @@ // #include "ReadIntoDense.h" -#include - -#include "detail/ops/Ops.h" -#include "detail/utils/Utils.h" +#include +#include +#include namespace fvdb { namespace detail { namespace autograd { -ReadIntoDense::variable_list ReadIntoDense::forward(ReadIntoDense::AutogradContext *ctx, - c10::intrusive_ptr grid, - ReadIntoDense::Variable sparseData, - const torch::optional& maybeMinCoord, - const torch::optional& maybeGridSize) { +ReadIntoDense::variable_list +ReadIntoDense::forward(ReadIntoDense::AutogradContext *ctx, c10::intrusive_ptr grid, + ReadIntoDense::Variable sparseData, + const torch::optional &maybeMinCoord, + const torch::optional &maybeGridSize) { TORCH_CHECK_VALUE(sparseData.dim() > 1, "sparse_data must have shape [num_voxels, *]"); - TORCH_CHECK_VALUE(sparseData.size(0) == grid->totalVoxels(), "sparseData must have shape (num_voxels, *) where num_voxels = " + std::to_string(grid->totalVoxels())); + TORCH_CHECK_VALUE(sparseData.size(0) == grid->totalVoxels(), + "sparseData must have shape (num_voxels, *) where num_voxels = " + + std::to_string(grid->totalVoxels())); TORCH_CHECK_VALUE(sparseData.is_contiguous(), "sparse_data must be contiguous"); grid->checkDevice(sparseData); // Non empty grid->checkNonEmptyGrid(); - nanovdb::CoordBBox gridbb = grid->totalBBox(); // FIXME: Batched should use maximum bounding box which we need to compute + nanovdb::CoordBBox gridbb = grid->totalBBox(); // FIXME: Batched should use maximum bounding box + // which we need to compute - // Min coord is an integer tensor of shape [3,] or [B, 3] representing the minimum coordinate of the dense tensor + // Min coord is an integer tensor of shape [3,] or [B, 3] representing the minimum coordinate of + // the dense tensor torch::Tensor denseOrigins; if (maybeMinCoord.has_value()) { - denseOrigins = maybeMinCoord.value().tensorValue(grid->batchSize(), false /*onlyPositive*/, "min_coord").to(sparseData.device()); + denseOrigins = maybeMinCoord.value() + .tensorValue(grid->batchSize(), false /*onlyPositive*/, 
"min_coord") + .to(sparseData.device()); } else { - denseOrigins = coordToTensor(gridbb.min()).to(torch::kInt32).unsqueeze(0).repeat({grid->batchSize(), 1}).to(sparseData.device()); + denseOrigins = coordToTensor(gridbb.min()) + .to(torch::kInt32) + .unsqueeze(0) + .repeat({ grid->batchSize(), 1 }) + .to(sparseData.device()); } TORCH_CHECK_VALUE(denseOrigins.dim() == 2, "min_coord must have shape [3,] or [B, 3]"); - TORCH_CHECK_VALUE(denseOrigins.size(0) == grid->batchSize(), "min_coord must have shape [3,] or [B, 3]"); + TORCH_CHECK_VALUE(denseOrigins.size(0) == grid->batchSize(), + "min_coord must have shape [3,] or [B, 3]"); TORCH_CHECK_VALUE(denseOrigins.size(1) == 3, "min_coord must have shape [3,] or [B, 3]"); nanovdb::Coord gridSize = gridbb.dim(); if (maybeGridSize.has_value()) { gridSize = maybeGridSize.value().value(); } - TORCH_CHECK_VALUE(gridSize[0] >= 0 && gridSize[1] >= 0 && gridSize[2] >= 0, "grid_size must be non-negative"); + TORCH_CHECK_VALUE(gridSize[0] >= 0 && gridSize[1] >= 0 && gridSize[2] >= 0, + "grid_size must be non-negative"); - torch::Tensor sparseDataReshape = featureCoalescedView(sparseData); // [N, -1] + torch::Tensor sparseDataReshape = featureCoalescedView(sparseData); // [N, -1] TORCH_CHECK_VALUE(sparseDataReshape.is_contiguous(), "sparse_data must be contiguous"); - torch::Tensor ret = torch::zeros({grid->batchSize(), gridSize[0], gridSize[1], gridSize[2], sparseDataReshape.size(1)}, sparseData.options()); // [B, W, H, D, -1] + torch::Tensor ret = torch::zeros( + { grid->batchSize(), gridSize[0], gridSize[1], gridSize[2], sparseDataReshape.size(1) }, + sparseData.options()); // [B, W, H, D, -1] FVDB_DISPATCH_KERNEL_DEVICE(grid->device(), [&]() { ops::dispatchReadIntoDense(*grid, sparseDataReshape, denseOrigins, ret, false); }); - torch::Tensor retReshape = ret.view(spliceShape({grid->batchSize(), gridSize[0], gridSize[1], gridSize[2]}, sparseData)); + torch::Tensor retReshape = ret.view( + spliceShape({ grid->batchSize(), gridSize[0], gridSize[1], gridSize[2] }, sparseData)); TORCH_CHECK(retReshape.is_contiguous(), "retReshape must be contiguous"); // Save shape information for backward ctx->saved_data["dense_origins"] = denseOrigins; - ctx->saved_data["grid_size"] = coordToTensor(gridSize); - torch::Tensor retShape = torch::empty({(int64_t) sparseData.dim()}, torch::TensorOptions().dtype(torch::kLong)); + ctx->saved_data["grid_size"] = coordToTensor(gridSize); + torch::Tensor retShape = + torch::empty({ (int64_t)sparseData.dim() }, torch::TensorOptions().dtype(torch::kLong)); auto acc = retShape.accessor(); for (int i = 0; i < sparseData.dim(); i++) { acc[i] = sparseData.size(i); } - ctx->saved_data["final_shape"] = retShape; - ctx->saved_data["first_dim"] = sparseDataReshape.size(0); - ctx->saved_data["last_dim"] = sparseDataReshape.size(1); - ctx->saved_data["dummy_tensor"] = torch::empty({0}, sparseData.options()); - ctx->saved_data["grid"] = grid; + ctx->saved_data["final_shape"] = retShape; + ctx->saved_data["first_dim"] = sparseDataReshape.size(0); + ctx->saved_data["last_dim"] = sparseDataReshape.size(1); + ctx->saved_data["dummy_tensor"] = torch::empty({ 0 }, sparseData.options()); + ctx->saved_data["grid"] = grid; - return variable_list({retReshape}); + return variable_list({ retReshape }); } -ReadIntoDense::variable_list ReadIntoDense::backward(ReadIntoDense::AutogradContext *ctx, - ReadIntoDense::variable_list grad_output) { - +ReadIntoDense::variable_list +ReadIntoDense::backward(ReadIntoDense::AutogradContext *ctx, + 
ReadIntoDense::variable_list grad_output) { // Use data saved in forward - torch::Tensor denseOrigins = ctx->saved_data["dense_origins"].toTensor(); // [B, 3] - int64_t firstDim = ctx->saved_data["first_dim"].toInt(); - int64_t lastDim = ctx->saved_data["last_dim"].toInt(); - std::vector finalShapeTensor = intTensor1DToStdVector(ctx->saved_data["final_shape"].toTensor()); + torch::Tensor denseOrigins = ctx->saved_data["dense_origins"].toTensor(); // [B, 3] + int64_t firstDim = ctx->saved_data["first_dim"].toInt(); + int64_t lastDim = ctx->saved_data["last_dim"].toInt(); + std::vector finalShapeTensor = + intTensor1DToStdVector(ctx->saved_data["final_shape"].toTensor()); torch::TensorOptions sparseDataOpts = ctx->saved_data["dummy_tensor"].toTensor().options(); - auto grid = ctx->saved_data["grid"].toCustomClass(); - Variable gradOut = grad_output.at(0); // [B, W, H, D, *] + auto grid = ctx->saved_data["grid"].toCustomClass(); + Variable gradOut = grad_output.at(0); // [B, W, H, D, *] - torch::Tensor gradOutReshape = featureCoalescedView(gradOut, 4); // [B, W, H, D, -1] + torch::Tensor gradOutReshape = featureCoalescedView(gradOut, 4); // [B, W, H, D, -1] - torch::Tensor ret = torch::zeros({firstDim, lastDim}, sparseDataOpts); // [N, -1] + torch::Tensor ret = torch::zeros({ firstDim, lastDim }, sparseDataOpts); // [N, -1] FVDB_DISPATCH_KERNEL_DEVICE(grid->device(), [&]() { ops::dispatchReadFromDense(*grid, gradOutReshape, denseOrigins, ret, false); }); - torch::Tensor retReshape = ret.view(finalShapeTensor); // [N, *] + torch::Tensor retReshape = ret.view(finalShapeTensor); // [N, *] - return {torch::Tensor(), retReshape, torch::Tensor(), torch::Tensor()}; + return { torch::Tensor(), retReshape, torch::Tensor(), torch::Tensor() }; } } // namespace autograd diff --git a/fvdb/src/detail/autograd/ReadIntoDense.h b/fvdb/src/detail/autograd/ReadIntoDense.h index c2d8736e97..d6bed67feb 100644 --- a/fvdb/src/detail/autograd/ReadIntoDense.h +++ b/fvdb/src/detail/autograd/ReadIntoDense.h @@ -1,7 +1,8 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_READINTODENSE_H +#define FVDB_DETAIL_AUTOGRAD_READINTODENSE_H #include @@ -14,20 +15,20 @@ namespace detail { namespace autograd { struct ReadIntoDense : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr grid, - Variable sparseData, - const torch::optional& maybeMinCoord, - const torch::optional& maybeGridSize); + static variable_list forward(AutogradContext *ctx, c10::intrusive_ptr grid, + Variable sparseData, + const torch::optional &maybeMinCoord, + const torch::optional &maybeGridSize); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_READINTODENSE_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/SampleGrid.cpp b/fvdb/src/detail/autograd/SampleGrid.cpp index 9cf5fd4c10..7a87a6b756 100644 --- a/fvdb/src/detail/autograd/SampleGrid.cpp +++ 
b/fvdb/src/detail/autograd/SampleGrid.cpp @@ -3,45 +3,44 @@ // #include "SampleGrid.h" -#include "detail/ops/Ops.h" +#include +#include -#include "detail/utils/Utils.h" - - - - -void checkForwardInputs(c10::intrusive_ptr grid, - fvdb::detail::autograd::SampleGridTrilinear::JaggedVariable points, - fvdb::detail::autograd::SampleGridTrilinear::Variable data, - bool returnGrad) { +void +checkForwardInputs(c10::intrusive_ptr grid, + fvdb::detail::autograd::SampleGridTrilinear::JaggedVariable points, + fvdb::detail::autograd::SampleGridTrilinear::Variable data, bool returnGrad) { grid->checkNonEmptyGrid(); - TORCH_CHECK_VALUE(points.device() == data.device(), "points and data must be on the same device"); + TORCH_CHECK_VALUE(points.device() == data.device(), + "points and data must be on the same device"); grid->checkDevice(points); grid->checkDevice(data); points.check_valid(); TORCH_CHECK_TYPE(points.is_floating_point(), "points must have a floating point type"); TORCH_CHECK_TYPE(points.dtype() == data.dtype(), "all tensors must have the same type"); - TORCH_CHECK_VALUE(points.rdim() == 2, "Expected points to have shape [B*M, 3] (wrong number of dimensions)"); + TORCH_CHECK_VALUE(points.rdim() == 2, + "Expected points to have shape [B*M, 3] (wrong number of dimensions)"); TORCH_CHECK(points.numel() > 0, "Empty tensor (points)"); TORCH_CHECK(points.rsize(1) == 3, "points must have shape [B, M, 3] (points must be 3D)"); TORCH_CHECK_TYPE(data.is_floating_point(), "data must have a floating point type"); - TORCH_CHECK_VALUE(data.dim() >= 2, "Expected data to have shape [N, *] (at least 2 dimensions)"); + TORCH_CHECK_VALUE(data.dim() >= 2, + "Expected data to have shape [N, *] (at least 2 dimensions)"); TORCH_CHECK(data.numel() > 0, "Empty tensor (data)"); - TORCH_CHECK(data.size(0) == grid->totalVoxels(), "grid_data must have one value per voxel (shape [N, *]) (wrong first dimension)"); + TORCH_CHECK(data.size(0) == grid->totalVoxels(), + "grid_data must have one value per voxel (shape [N, *]) (wrong first dimension)"); } - namespace fvdb { namespace detail { namespace autograd { -SampleGridTrilinear::variable_list SampleGridTrilinear::forward(SampleGridTrilinear::AutogradContext *ctx, - c10::intrusive_ptr grid, - SampleGridTrilinear::JaggedVariable points, - SampleGridTrilinear::Variable data, - bool returnGrad) { +SampleGridTrilinear::variable_list +SampleGridTrilinear::forward(SampleGridTrilinear::AutogradContext *ctx, + c10::intrusive_ptr grid, + SampleGridTrilinear::JaggedVariable points, + SampleGridTrilinear::Variable data, bool returnGrad) { checkForwardInputs(grid, points, data, returnGrad); auto ret = FVDB_DISPATCH_KERNEL_DEVICE(points.device(), [&]() { @@ -53,59 +52,54 @@ SampleGridTrilinear::variable_list SampleGridTrilinear::forward(SampleGridTrilin }); // Save data for backward in context - ctx->save_for_backward({data, points.jdata(), points.joffsets(), points.jlidx()}); - ctx->saved_data["grid"] = grid; + ctx->save_for_backward({ data, points.jdata(), points.joffsets(), points.jlidx() }); + ctx->saved_data["grid"] = grid; ctx->saved_data["return_grad"] = returnGrad; return ret; } - - - -SampleGridTrilinear::variable_list SampleGridTrilinear::backward(SampleGridTrilinear::AutogradContext *ctx, - SampleGridTrilinear::variable_list grad_output) { - +SampleGridTrilinear::variable_list +SampleGridTrilinear::backward(SampleGridTrilinear::AutogradContext *ctx, + SampleGridTrilinear::variable_list grad_output) { // Use data saved in forward variable_list saved = 
ctx->get_saved_variables(); - Variable data = saved.at(0); + Variable data = saved.at(0); - Variable pointCoords = saved.at(1); + Variable pointCoords = saved.at(1); Variable pointJOffsets = saved.at(2); - Variable pointsJLidx = saved.at(3); + Variable pointsJLidx = saved.at(3); - auto grid = ctx->saved_data["grid"].toCustomClass(); - bool returnGrad = ctx->saved_data["return_grad"].toBool(); - Variable gradOut = grad_output.at(0); // [B*M, *] + auto grid = ctx->saved_data["grid"].toCustomClass(); + bool returnGrad = ctx->saved_data["return_grad"].toBool(); + Variable gradOut = grad_output.at(0); // [B*M, *] torch::Tensor outGrad = FVDB_DISPATCH_KERNEL_DEVICE(gradOut.device(), [&]() { if (returnGrad) { - Variable gradPtsGrad = grad_output.at(1); // [B*M, -1, 3] + Variable gradPtsGrad = grad_output.at(1); // [B*M, -1, 3] return ops::dispatchSampleGridTrilinearWithGradBackward( - *grid, JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, pointsJLidx), data, gradOut, gradPtsGrad); + *grid, + JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, + pointsJLidx), + data, gradOut, gradPtsGrad); } else { return ops::dispatchSplatIntoGridTrilinear( - *grid, JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, pointsJLidx), gradOut); + *grid, + JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, + pointsJLidx), + gradOut); } }); - return {torch::Tensor(), torch::Tensor(), outGrad, torch::Tensor()}; + return { torch::Tensor(), torch::Tensor(), outGrad, torch::Tensor() }; } - - - - - - - -SampleGridBezier::variable_list SampleGridBezier::forward(SampleGridBezier::AutogradContext *ctx, - c10::intrusive_ptr grid, - SampleGridBezier::JaggedVariable points, - SampleGridBezier::Variable data, - bool returnGrad) { +SampleGridBezier::variable_list +SampleGridBezier::forward(SampleGridBezier::AutogradContext *ctx, + c10::intrusive_ptr grid, + SampleGridBezier::JaggedVariable points, SampleGridBezier::Variable data, + bool returnGrad) { checkForwardInputs(grid, points, data, returnGrad); - std::vector ret = FVDB_DISPATCH_KERNEL_DEVICE(points.device(), [&]() { if (returnGrad) { return ops::dispatchSampleGridBezierWithGrad(*grid, points, data); @@ -115,45 +109,48 @@ SampleGridBezier::variable_list SampleGridBezier::forward(SampleGridBezier::Auto }); // Save data for backward in context - ctx->save_for_backward({data, points.jdata(), points.joffsets(), points.jlidx()}); - ctx->saved_data["grid"] = grid; + ctx->save_for_backward({ data, points.jdata(), points.joffsets(), points.jlidx() }); + ctx->saved_data["grid"] = grid; ctx->saved_data["return_grad"] = returnGrad; return ret; } - -SampleGridBezier::variable_list SampleGridBezier::backward(SampleGridBezier::AutogradContext *ctx, - SampleGridBezier::variable_list grad_output) { - +SampleGridBezier::variable_list +SampleGridBezier::backward(SampleGridBezier::AutogradContext *ctx, + SampleGridBezier::variable_list grad_output) { // Use data saved in forward variable_list saved = ctx->get_saved_variables(); - Variable data = saved.at(0); + Variable data = saved.at(0); - Variable pointCoords = saved.at(1); + Variable pointCoords = saved.at(1); Variable pointJOffsets = saved.at(2); - Variable pointsJLidx = saved.at(3); + Variable pointsJLidx = saved.at(3); - auto grid = ctx->saved_data["grid"].toCustomClass(); - bool returnGrad = ctx->saved_data["return_grad"].toBool(); - Variable gradOut = grad_output.at(0); // [B*M, *] + auto grid = ctx->saved_data["grid"].toCustomClass(); + bool 
returnGrad = ctx->saved_data["return_grad"].toBool(); + Variable gradOut = grad_output.at(0); // [B*M, *] Variable outGrad = FVDB_DISPATCH_KERNEL_DEVICE(gradOut.device(), [&]() { if (returnGrad) { - Variable gradPtsGrad = grad_output.at(1); // [B*M, -1, 3] + Variable gradPtsGrad = grad_output.at(1); // [B*M, -1, 3] return ops::dispatchSampleGridBezierWithGradBackward( - *grid, JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, pointsJLidx), gradOut, gradPtsGrad, data); + *grid, + JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, + pointsJLidx), + gradOut, gradPtsGrad, data); } else { return ops::dispatchSplatIntoGridBezier( - *grid, JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, pointsJLidx), gradOut); + *grid, + JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, + pointsJLidx), + gradOut); } }); - return {torch::Tensor(), torch::Tensor(), outGrad, torch::Tensor()}; + return { torch::Tensor(), torch::Tensor(), outGrad, torch::Tensor() }; } - - -} // namespace autograd -} // namespace detail -} // namespace fvdb +} // namespace autograd +} // namespace detail +} // namespace fvdb diff --git a/fvdb/src/detail/autograd/SampleGrid.h b/fvdb/src/detail/autograd/SampleGrid.h index 424c6a3702..2e35dc4576 100644 --- a/fvdb/src/detail/autograd/SampleGrid.h +++ b/fvdb/src/detail/autograd/SampleGrid.h @@ -1,51 +1,43 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_SAMPLEGRID_H +#define FVDB_DETAIL_AUTOGRAD_SAMPLEGRID_H #include #include "detail/GridBatchImpl.h" - namespace fvdb { namespace detail { namespace autograd { - struct SampleGridTrilinear : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; - using JaggedVariable = JaggedTensor; + using Variable = torch::autograd::Variable; + using JaggedVariable = JaggedTensor; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr grid, - JaggedTensor points, - Variable data, - bool returnGrad = false); + static variable_list forward(AutogradContext *ctx, c10::intrusive_ptr grid, + JaggedTensor points, Variable data, bool returnGrad = false); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; - struct SampleGridBezier : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; - using JaggedVariable = JaggedTensor; + using Variable = torch::autograd::Variable; + using JaggedVariable = JaggedTensor; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr grid, - JaggedTensor points, - Variable data, - bool returnGrad = false); + static variable_list forward(AutogradContext *ctx, c10::intrusive_ptr grid, + JaggedTensor points, Variable data, bool returnGrad = false); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb 
+ +#endif // FVDB_DETAIL_AUTOGRAD_SAMPLEGRID_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/SparseConvolutionHalo.cpp b/fvdb/src/detail/autograd/SparseConvolutionHalo.cpp index 323cb9e226..64db08db78 100644 --- a/fvdb/src/detail/autograd/SparseConvolutionHalo.cpp +++ b/fvdb/src/detail/autograd/SparseConvolutionHalo.cpp @@ -3,66 +3,71 @@ // #include "SparseConvolutionHalo.h" -#include "detail/ops/convolution/backend/ConvOps.h" - -#include "detail/utils/Utils.h" +#include +#include namespace fvdb { namespace detail { namespace autograd { - -SparseConvolutionHalo::variable_list SparseConvolutionHalo::forward(SparseConvolutionHalo::AutogradContext *ctx, - c10::intrusive_ptr grid, - SparseConvolutionHalo::Variable inFeatures, - SparseConvolutionHalo::Variable kernels, - int variant) { - +SparseConvolutionHalo::variable_list +SparseConvolutionHalo::forward(SparseConvolutionHalo::AutogradContext *ctx, + c10::intrusive_ptr grid, + SparseConvolutionHalo::Variable inFeatures, + SparseConvolutionHalo::Variable kernels, int variant) { // Check kernels TORCH_CHECK_TYPE(kernels.is_floating_point(), "kernels must have a floating point type"); - TORCH_CHECK_VALUE(kernels.dim() == 5, std::string("Expected kernels to have 5 dimensions (shape (out_ch, in_ch, d, h, w)) but got ") + - std::to_string(kernels.dim()) + " dimensions"); - TORCH_CHECK_NOT_IMPLEMENTED(kernels.size(2) == kernels.size(3) && kernels.size(3) == kernels.size(4) && kernels.size(2) == 3, - "sparse_conv_halo only supports kernels of size 3x3x3"); + TORCH_CHECK_VALUE( + kernels.dim() == 5, + std::string( + "Expected kernels to have 5 dimensions (shape (out_ch, in_ch, d, h, w)) but got ") + + std::to_string(kernels.dim()) + " dimensions"); + TORCH_CHECK_NOT_IMPLEMENTED(kernels.size(2) == kernels.size(3) && + kernels.size(3) == kernels.size(4) && kernels.size(2) == 3, + "sparse_conv_halo only supports kernels of size 3x3x3"); // Check features TORCH_CHECK_VALUE(inFeatures.is_contiguous(), "features must be contiguous"); TORCH_CHECK_TYPE(inFeatures.is_floating_point(), "features must have a floating point type"); - TORCH_CHECK_VALUE(inFeatures.dim() == 2, std::string("Expected features to have 2 dimensions (shape (n, nF)) but got ") + - std::to_string(inFeatures.dim()) + " dimensions"); - TORCH_CHECK_VALUE(kernels.size(1) == inFeatures.size(1), - "Expected input channels of kernels (" + std::to_string(kernels.size(1)) + - ") to equal input channels of features: " + std::to_string(inFeatures.size(1))); + TORCH_CHECK_VALUE( + inFeatures.dim() == 2, + std::string("Expected features to have 2 dimensions (shape (n, nF)) but got ") + + std::to_string(inFeatures.dim()) + " dimensions"); + TORCH_CHECK_VALUE( + kernels.size(1) == inFeatures.size(1), + "Expected input channels of kernels (" + std::to_string(kernels.size(1)) + + ") to equal input channels of features: " + std::to_string(inFeatures.size(1))); // [O, I, 3, 3, 3] to [3, 3, 3, I, O] - kernels = kernels.permute({4, 3, 2, 1, 0}).contiguous(); + kernels = kernels.permute({ 4, 3, 2, 1, 0 }).contiguous(); torch::Tensor outFeatures = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { return ops::dispatchSparseConvolutionHalo(*grid, inFeatures, kernels, variant); }); // Save data for backward in context - ctx->save_for_backward({inFeatures, kernels}); - ctx->saved_data["grid"] = grid; + ctx->save_for_backward({ inFeatures, kernels }); + ctx->saved_data["grid"] = grid; ctx->saved_data["variant"] = variant; - return variable_list({outFeatures}); + return variable_list({ 
outFeatures }); } - -SparseConvolutionHalo::variable_list SparseConvolutionHalo::backward(AutogradContext *ctx, variable_list grad_output) { - +SparseConvolutionHalo::variable_list +SparseConvolutionHalo::backward(AutogradContext *ctx, variable_list grad_output) { variable_list saved = ctx->get_saved_variables(); - TORCH_CHECK(saved.size() > 0, "No backward context computed during forward. Please pass in training=True when calling kmap.build_implicit_gemm()"); - auto grid = ctx->saved_data["grid"].toCustomClass(); - int variant = ctx->saved_data["variant"].toInt(); + TORCH_CHECK( + saved.size() > 0, + "No backward context computed during forward. Please pass in training=True when calling kmap.build_implicit_gemm()"); + auto grid = ctx->saved_data["grid"].toCustomClass(); + int variant = ctx->saved_data["variant"].toInt(); Variable inFeatures = saved.at(0); - Variable kernels = saved.at(1); // [3, 3, 3, I, O] - Variable gradOut = grad_output.at(0); + Variable kernels = saved.at(1); // [3, 3, 3, I, O] + Variable gradOut = grad_output.at(0); - kernels = kernels.permute({0, 1, 2, 4, 3}).flip({0, 1, 2}).contiguous(); // [3, 3, 3, O, I] - torch::Tensor gradInput = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { + kernels = kernels.permute({ 0, 1, 2, 4, 3 }).flip({ 0, 1, 2 }).contiguous(); // [3, 3, 3, O, I] + torch::Tensor gradInput = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { return ops::dispatchSparseConvolutionHalo(*grid, gradOut, kernels, variant); }); torch::Tensor gradKernel = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { @@ -70,12 +75,11 @@ SparseConvolutionHalo::variable_list SparseConvolutionHalo::backward(AutogradCon }); // [3, 3, 3, I, O] to [O, I, 3, 3, 3] - gradKernel = gradKernel.permute({4, 3, 2, 1, 0}).contiguous(); + gradKernel = gradKernel.permute({ 4, 3, 2, 1, 0 }).contiguous(); - return {torch::Tensor(), gradInput, gradKernel, torch::Tensor()}; + return { torch::Tensor(), gradInput, gradKernel, torch::Tensor() }; } - } // namespace autograd } // namespace detail } // namespace fvdb \ No newline at end of file diff --git a/fvdb/src/detail/autograd/SparseConvolutionHalo.h b/fvdb/src/detail/autograd/SparseConvolutionHalo.h index 5d5cb2e1a5..eb577a900f 100644 --- a/fvdb/src/detail/autograd/SparseConvolutionHalo.h +++ b/fvdb/src/detail/autograd/SparseConvolutionHalo.h @@ -1,29 +1,25 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_SPARSECONVOLUTIONHALO_H +#define FVDB_DETAIL_AUTOGRAD_SPARSECONVOLUTIONHALO_H -#include - -#include "detail/ops/Ops.h" - -#include "SparseConvPackInfo.h" +#include +#include +#include namespace fvdb { namespace detail { namespace autograd { struct SparseConvolutionHalo : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr grid, - Variable inFeatures, - Variable kernels, - int variant); + static variable_list forward(AutogradContext *ctx, c10::intrusive_ptr grid, + Variable inFeatures, Variable kernels, int variant); static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; @@ -31,3 +27,5 @@ struct SparseConvolutionHalo : public torch::autograd::Function - -#include "detail/ops/Ops.h" - -#include 
"SparseConvPackInfo.h" +#include +#include +#include namespace fvdb { namespace detail { namespace autograd { -struct SparseConvolutionImplicitGEMM : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; +struct SparseConvolutionImplicitGEMM + : public torch::autograd::Function { + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - Variable inFeatures, Variable kernels, - SparseConvPackInfo packInfo, - bool transposed) { + static variable_list + forward(AutogradContext *ctx, Variable inFeatures, Variable kernels, + SparseConvPackInfo packInfo, bool transposed) { if (transposed) { packInfo = packInfo.transposed(); } @@ -30,155 +29,178 @@ struct SparseConvolutionImplicitGEMM : public torch::autograd::Function sizes = { (int) packInfo.sourceGrid().total_voxels(), (int) packInfo.targetGrid().total_voxels() }; - TORCH_CHECK(packInfo.sourceGrid().is_mutable() == packInfo.targetGrid().is_mutable(), "Source and target grids must both be mutable or immutable"); + const std::vector sizes = { (int)packInfo.sourceGrid().total_voxels(), + (int)packInfo.targetGrid().total_voxels() }; + TORCH_CHECK(packInfo.sourceGrid().is_mutable() == packInfo.targetGrid().is_mutable(), + "Source and target grids must both be mutable or immutable"); // Check features and kernels TORCH_CHECK_VALUE(inFeatures.is_contiguous(), "features must be contiguous"); - TORCH_CHECK_TYPE(inFeatures.is_floating_point(), "features must have a floating point type"); - TORCH_CHECK_VALUE(inFeatures.dim() == 2, std::string("Expected features to have 2 dimensions (shape (n, nF)) but got ") + - std::to_string(inFeatures.dim()) + " dimensions"); + TORCH_CHECK_TYPE(inFeatures.is_floating_point(), + "features must have a floating point type"); + TORCH_CHECK_VALUE( + inFeatures.dim() == 2, + std::string("Expected features to have 2 dimensions (shape (n, nF)) but got ") + + std::to_string(inFeatures.dim()) + " dimensions"); TORCH_CHECK_TYPE(kernels.is_floating_point(), "kernels must have a floating point type"); for (int i = 0; i < kernels.dim(); i += 1) { - TORCH_CHECK_VALUE(kernels.size(i) != 0, "kernels tensor has zero dimension (dim = " + std::to_string(i) + ")"); + TORCH_CHECK_VALUE(kernels.size(i) != 0, "kernels tensor has zero dimension (dim = " + + std::to_string(i) + ")"); } - auto opt = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU); - torch::Tensor kWidth = torch::empty({3,}, opt); + auto opt = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCPU); + torch::Tensor kWidth = torch::empty( + { + 3, + }, + opt); int inC, outC; if (!transposed) { - TORCH_CHECK_VALUE(inFeatures.size(0) == sizes[0], "The number of input features must match the number of voxels"); - TORCH_CHECK_VALUE(kernels.dim() == 5, std::string("Expected kernels to have 5 dimensions (shape (out_ch, in_ch, d, h, w)) but got ") + - std::to_string(kernels.dim()) + " dimensions"); - TORCH_CHECK_VALUE(kernels.size(1) == inFeatures.size(1), - "Expected input channels of kernels (" + std::to_string(kernels.size(1)) + - ") to equal input channels of features: " + std::to_string(inFeatures.size(1))); - outC = kernels.size(0); inC = kernels.size(1); + TORCH_CHECK_VALUE(inFeatures.size(0) == sizes[0], + "The number of input features must match the number of voxels"); + TORCH_CHECK_VALUE( + kernels.dim() == 5, + 
std::string( + "Expected kernels to have 5 dimensions (shape (out_ch, in_ch, d, h, w)) but got ") + + std::to_string(kernels.dim()) + " dimensions"); + TORCH_CHECK_VALUE( + kernels.size(1) == inFeatures.size(1), + "Expected input channels of kernels (" + std::to_string(kernels.size(1)) + + ") to equal input channels of features: " + std::to_string(inFeatures.size(1))); + outC = kernels.size(0); + inC = kernels.size(1); kWidth[0] = kernels.size(2); kWidth[1] = kernels.size(3); kWidth[2] = kernels.size(4); - kernels = kernels.permute({4, 3, 2, 1, 0}).reshape({-1, inC, outC}).contiguous(); + kernels = kernels.permute({ 4, 3, 2, 1, 0 }).reshape({ -1, inC, outC }).contiguous(); } else { - TORCH_CHECK_VALUE(inFeatures.size(0) == sizes[1], "The number of input features must match the number of voxels"); - TORCH_CHECK_VALUE(kernels.dim() == 5, std::string("Expected kernels to have 5 dimensions (shape (in_ch, out_ch, d, h, w)) but got ") + - std::to_string(kernels.dim()) + " dimensions"); - TORCH_CHECK_VALUE(kernels.size(0) == inFeatures.size(1), - "Expected input channels of kernels (" + std::to_string(kernels.size(0)) + - ") to equal input channels of features: " + std::to_string(inFeatures.size(1))); - int inC = kernels.size(0); outC = kernels.size(1); + TORCH_CHECK_VALUE(inFeatures.size(0) == sizes[1], + "The number of input features must match the number of voxels"); + TORCH_CHECK_VALUE( + kernels.dim() == 5, + std::string( + "Expected kernels to have 5 dimensions (shape (in_ch, out_ch, d, h, w)) but got ") + + std::to_string(kernels.dim()) + " dimensions"); + TORCH_CHECK_VALUE( + kernels.size(0) == inFeatures.size(1), + "Expected input channels of kernels (" + std::to_string(kernels.size(0)) + + ") to equal input channels of features: " + std::to_string(inFeatures.size(1))); + int inC = kernels.size(0); + outC = kernels.size(1); kWidth[0] = kernels.size(2); kWidth[1] = kernels.size(3); kWidth[2] = kernels.size(4); - kernels = kernels.permute({4, 3, 2, 0, 1}).reshape({-1, inC, outC}).contiguous(); + kernels = kernels.permute({ 4, 3, 2, 0, 1 }).reshape({ -1, inC, outC }).contiguous(); } torch::Tensor output; if (packInfo.targetGrid().total_voxels() > 0) { int outFeats = transposed ? sizes[0] : sizes[1]; // Emprically larger kernels do not work right now, default to non-sorted version. 
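// A quick sanity sketch of the weight layout handled above: the permute/reshape pair turns a
// PyTorch-style 5-D conv3d weight into the flat [kernelVolume, inC, outC] layout that the
// implicit-GEMM dispatch consumes. Plain libtorch, hypothetical sizes, no fvdb types; the
// reversal permutation is its own inverse, so the round trip recovers the original weight.
#include <torch/torch.h>

int main() {
    const int64_t outC = 8, inC = 4, kD = 3, kH = 3, kW = 3; // hypothetical sizes
    torch::Tensor kernels = torch::randn({ outC, inC, kD, kH, kW });

    // Same transformation as the non-transposed branch: (out, in, d, h, w) -> [kD*kH*kW, inC, outC].
    torch::Tensor flat =
        kernels.permute({ 4, 3, 2, 1, 0 }).reshape({ -1, inC, outC }).contiguous();
    TORCH_CHECK(flat.size(0) == kD * kH * kW && flat.size(1) == inC && flat.size(2) == outC);

    // Reshaping back to the permuted layout and applying the same (self-inverse) permutation
    // restores the 5-D weight; the weight-gradient reshape in backward() is the analogous inverse.
    torch::Tensor restored =
        flat.reshape({ kW, kH, kD, inC, outC }).permute({ 4, 3, 2, 1, 0 }).contiguous();
    TORCH_CHECK(torch::equal(restored, kernels));
    return 0;
}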
- bool canSort = !transposed && (packInfo.kernelSize().value() < fvdb::Vec3iOrScalar(4).value()); + bool canSort = + !transposed && (packInfo.kernelSize().value() < fvdb::Vec3iOrScalar(4).value()); if (packInfo.reoderOutInMap().has_value() && canSort) { output = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { return ops::dispatchSparseConvolutionImplicitGEMMSorted( - inFeatures, kernels, - packInfo.reoderOutInMap().value(), - packInfo.reducedSortedMask().value(), - packInfo.reorderLoc().value(), - outFeats, outC, useTF32, true); + inFeatures, kernels, packInfo.reoderOutInMap().value(), + packInfo.reducedSortedMask().value(), packInfo.reorderLoc().value(), + outFeats, outC, useTF32, true); }); } else { output = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { return ops::dispatchSparseConvolutionImplicitGEMM( - inFeatures, kernels, outInMap, outFeats, outC, useTF32, true); + inFeatures, kernels, outInMap, outFeats, outC, useTF32, true); }); } } else { auto opt = torch::TensorOptions().dtype(inFeatures.dtype()).device(inFeatures.device()); - output = torch::empty({0, kernels.size(-1)}, opt); + output = torch::empty({ 0, kernels.size(-1) }, opt); } // Save for backward (for training mode) if (packInfo.outInMapBwd().has_value()) { - ctx->save_for_backward({inFeatures, kernels, - packInfo.outInMapBwd().value(), - packInfo.reorderOutInMapBwd().value(), - packInfo.sortedMaskBwdW().value(), - packInfo.sortedMaskBwdD().value(), - packInfo.reorderLocBwd().value()}); + ctx->save_for_backward( + { inFeatures, kernels, packInfo.outInMapBwd().value(), + packInfo.reorderOutInMapBwd().value(), packInfo.sortedMaskBwdW().value(), + packInfo.sortedMaskBwdD().value(), packInfo.reorderLocBwd().value() }); } - ctx->saved_data["use_tf32"] = useTF32; + ctx->saved_data["use_tf32"] = useTF32; ctx->saved_data["kernel_width"] = kWidth; - ctx->saved_data["transposed"] = transposed; + ctx->saved_data["transposed"] = transposed; - return {output}; + return { output }; } - static variable_list backward(AutogradContext *ctx, variable_list grad_output) { + static variable_list + backward(AutogradContext *ctx, variable_list grad_output) { variable_list saved = ctx->get_saved_variables(); - TORCH_CHECK(saved.size() > 0, "No backward context computed during forward. Please pass in training=True when calling kmap.build_implicit_gemm()"); - - Variable inFeatures = saved.at(0); - Variable kernels = saved.at(1); - Variable outInMapBwd = saved.at(2); - Variable reorderOutInMapBwd = saved.at(3); - Variable sortedMaskBwdW = saved.at(4); - Variable sortedMaskBwdD = saved.at(5); - Variable reorderLocBwd = saved.at(6); - bool useTF32 = ctx->saved_data["use_tf32"].toBool(); - torch::Tensor kWidth = ctx->saved_data["kernel_width"].toTensor(); - bool transposed = ctx->saved_data["transposed"].toBool(); - - Variable gradOut = grad_output.at(0); + TORCH_CHECK( + saved.size() > 0, + "No backward context computed during forward. 
Please pass in training=True when calling kmap.build_implicit_gemm()"); + + Variable inFeatures = saved.at(0); + Variable kernels = saved.at(1); + Variable outInMapBwd = saved.at(2); + Variable reorderOutInMapBwd = saved.at(3); + Variable sortedMaskBwdW = saved.at(4); + Variable sortedMaskBwdD = saved.at(5); + Variable reorderLocBwd = saved.at(6); + bool useTF32 = ctx->saved_data["use_tf32"].toBool(); + torch::Tensor kWidth = ctx->saved_data["kernel_width"].toTensor(); + bool transposed = ctx->saved_data["transposed"].toBool(); + + Variable gradOut = grad_output.at(0); torch::Tensor gradInput, gradWeight; // Dispatching following torchsparse++ int kernelVolume = kernels.size(0); - int inC = kernels.size(1); - int outC = kernels.size(2); + int inC = kernels.size(1); + int outC = kernels.size(2); if (kernelVolume < 32) { - gradInput = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { + gradInput = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { return ops::dispatchSparseConvolutionImplicitGEMMSorted( - gradOut, kernels.transpose(2, 1).contiguous(), - reorderOutInMapBwd, - sortedMaskBwdD, - reorderLocBwd, - inFeatures.size(0), inC, useTF32, true); + gradOut, kernels.transpose(2, 1).contiguous(), reorderOutInMapBwd, + sortedMaskBwdD, reorderLocBwd, inFeatures.size(0), inC, useTF32, true); }); gradWeight = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { return ops::dispatchSparseConvolutionImplicitGEMMGradSorted( - gradOut, inFeatures, reorderOutInMapBwd, - sortedMaskBwdW, reorderLocBwd, 32, - useTF32, true); + gradOut, inFeatures, reorderOutInMapBwd, sortedMaskBwdW, reorderLocBwd, 32, + useTF32, true); }); } else { - gradInput = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { + gradInput = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { return ops::dispatchSparseConvolutionImplicitGEMM( - gradOut, kernels.transpose(2, 1).contiguous(), - outInMapBwd, - inFeatures.size(0), inC, useTF32, true); + gradOut, kernels.transpose(2, 1).contiguous(), outInMapBwd, inFeatures.size(0), + inC, useTF32, true); }); gradWeight = FVDB_DISPATCH_KERNEL_DEVICE(inFeatures.device(), [&]() { return ops::dispatchSparseConvolutionImplicitGEMMGrad( - gradOut, inFeatures, outInMapBwd, 32, - useTF32, true); + gradOut, inFeatures, outInMapBwd, 32, useTF32, true); }); } if (!transposed) { - gradWeight = gradWeight.reshape( - {kWidth[2].item(), kWidth[1].item(), kWidth[0].item(), outC, inC}).permute({3, 4, 2, 1, 0}); + gradWeight = gradWeight + .reshape({ kWidth[2].item(), kWidth[1].item(), + kWidth[0].item(), outC, inC }) + .permute({ 3, 4, 2, 1, 0 }); } else { - gradWeight = gradWeight.reshape( - {kWidth[2].item(), kWidth[1].item(), kWidth[0].item(), outC, inC}).permute({4, 3, 2, 1, 0}); + gradWeight = gradWeight + .reshape({ kWidth[2].item(), kWidth[1].item(), + kWidth[0].item(), outC, inC }) + .permute({ 4, 3, 2, 1, 0 }); } - return {gradInput, gradWeight, torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor()}; + return { gradInput, gradWeight, torch::Tensor(), + torch::Tensor(), torch::Tensor(), torch::Tensor() }; } }; } // namespace autograd } // namespace detail } // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_SPARSECONVOLUTIONIMPLICITGEMM_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/SparseConvolutionKernelMap.h b/fvdb/src/detail/autograd/SparseConvolutionKernelMap.h index c9303b467e..ae68311051 100644 --- a/fvdb/src/detail/autograd/SparseConvolutionKernelMap.h +++ b/fvdb/src/detail/autograd/SparseConvolutionKernelMap.h @@ -1,96 +1,116 
@@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_SPARSECONVOLUTIONKERNELMAP_H +#define FVDB_DETAIL_AUTOGRAD_SPARSECONVOLUTIONKERNELMAP_H -#include - -#include "detail/ops/convolution/backend/ConvOps.h" - -#include "SparseConvPackInfo.h" +#include +#include +#include namespace fvdb { namespace detail { namespace autograd { struct SparseConvolutionKernelMap : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; - - static variable_list forward(AutogradContext *ctx, - Variable inFeatures, Variable kernels, - const SparseConvPackInfo& packInfo, - bool transposed) { + using Variable = torch::autograd::Variable; + static variable_list + forward(AutogradContext *ctx, Variable inFeatures, Variable kernels, + const SparseConvPackInfo &packInfo, bool transposed) { TORCH_CHECK(packInfo.neighborMap().has_value() && packInfo.neighborSizes().has_value(), "Neighbor map must be built for sparse convolution"); - torch::Tensor nbmaps = packInfo.neighborMap().value(); - torch::Tensor nbsizes = packInfo.neighborSizes().value(); - const std::vector sizes = { (int) packInfo.sourceGrid().total_voxels(), (int) packInfo.targetGrid().total_voxels() }; - const bool middleAcceleration = !(packInfo.sourceGrid().is_mutable() && packInfo.targetGrid().is_mutable()) && \ - packInfo.stride().value() == Vec3iOrScalar(1).value(); + torch::Tensor nbmaps = packInfo.neighborMap().value(); + torch::Tensor nbsizes = packInfo.neighborSizes().value(); + const std::vector sizes = { (int)packInfo.sourceGrid().total_voxels(), + (int)packInfo.targetGrid().total_voxels() }; + const bool middleAcceleration = + !(packInfo.sourceGrid().is_mutable() && packInfo.targetGrid().is_mutable()) && + packInfo.stride().value() == Vec3iOrScalar(1).value(); - TORCH_CHECK(packInfo.sourceGrid().is_mutable() == packInfo.targetGrid().is_mutable(), "Source and target grids must both be mutable or immutable"); + TORCH_CHECK(packInfo.sourceGrid().is_mutable() == packInfo.targetGrid().is_mutable(), + "Source and target grids must both be mutable or immutable"); // Check features TORCH_CHECK_VALUE(inFeatures.is_contiguous(), "features must be contiguous"); - TORCH_CHECK_TYPE(inFeatures.is_floating_point(), "features must have a floating point type"); - TORCH_CHECK_VALUE(inFeatures.dim() == 2, std::string("Expected features to have 2 dimensions (shape (n, nF)) but got ") + - std::to_string(inFeatures.dim()) + " dimensions"); + TORCH_CHECK_TYPE(inFeatures.is_floating_point(), + "features must have a floating point type"); + TORCH_CHECK_VALUE( + inFeatures.dim() == 2, + std::string("Expected features to have 2 dimensions (shape (n, nF)) but got ") + + std::to_string(inFeatures.dim()) + " dimensions"); // Check kernels TORCH_CHECK_TYPE(kernels.is_floating_point(), "kernels must have a floating point type"); for (int i = 0; i < kernels.dim(); i += 1) { - TORCH_CHECK_VALUE(kernels.size(i) != 0, "kernels tensor has zero dimension (dim = " + std::to_string(i) + ")"); + TORCH_CHECK_VALUE(kernels.size(i) != 0, "kernels tensor has zero dimension (dim = " + + std::to_string(i) + ")"); } // Check pack info - TORCH_CHECK(nbmaps.is_contiguous() && nbmaps.scalar_type() == torch::kInt32, "nbmaps must be contiguous"); - TORCH_CHECK(nbsizes.is_contiguous() && nbsizes.scalar_type() == torch::kInt32, 
"nbsizes must be contiguous"); - - auto opt = torch::TensorOptions().dtype(torch::kInt32).device(inFeatures.device()); - torch::Tensor kWidth = torch::empty({3,}, opt); + TORCH_CHECK(nbmaps.is_contiguous() && nbmaps.scalar_type() == torch::kInt32, + "nbmaps must be contiguous"); + TORCH_CHECK(nbsizes.is_contiguous() && nbsizes.scalar_type() == torch::kInt32, + "nbsizes must be contiguous"); + + auto opt = torch::TensorOptions().dtype(torch::kInt32).device(inFeatures.device()); + torch::Tensor kWidth = torch::empty( + { + 3, + }, + opt); if (!transposed) { - TORCH_CHECK_VALUE(inFeatures.size(0) == sizes[0], "The number of input features must match the number of voxels"); - TORCH_CHECK_VALUE(kernels.dim() == 5, std::string("Expected kernels to have 5 dimensions (shape (out_ch, in_ch, d, h, w)) but got ") + - std::to_string(kernels.dim()) + " dimensions"); - TORCH_CHECK_VALUE(kernels.size(1) == inFeatures.size(1), - "Expected input channels of kernels (" + std::to_string(kernels.size(1)) + - ") to equal input channels of features: " + std::to_string(inFeatures.size(1))); + TORCH_CHECK_VALUE(inFeatures.size(0) == sizes[0], + "The number of input features must match the number of voxels"); + TORCH_CHECK_VALUE( + kernels.dim() == 5, + std::string( + "Expected kernels to have 5 dimensions (shape (out_ch, in_ch, d, h, w)) but got ") + + std::to_string(kernels.dim()) + " dimensions"); + TORCH_CHECK_VALUE( + kernels.size(1) == inFeatures.size(1), + "Expected input channels of kernels (" + std::to_string(kernels.size(1)) + + ") to equal input channels of features: " + std::to_string(inFeatures.size(1))); const int outC = kernels.size(0), inC = kernels.size(1); kWidth[0] = kernels.size(2); kWidth[1] = kernels.size(3); kWidth[2] = kernels.size(4); - kernels = kernels.permute({4, 3, 2, 1, 0}).reshape({-1, inC, outC}).contiguous(); + kernels = kernels.permute({ 4, 3, 2, 1, 0 }).reshape({ -1, inC, outC }).contiguous(); } else { - TORCH_CHECK_VALUE(inFeatures.size(0) == sizes[1], "The number of input features must match the number of voxels"); - TORCH_CHECK_VALUE(kernels.dim() == 5, std::string("Expected kernels to have 5 dimensions (shape (in_ch, out_ch, d, h, w)) but got ") + - std::to_string(kernels.dim()) + " dimensions"); - TORCH_CHECK_VALUE(kernels.size(0) == inFeatures.size(1), - "Expected input channels of kernels (" + std::to_string(kernels.size(0)) + - ") to equal input channels of features: " + std::to_string(inFeatures.size(1))); + TORCH_CHECK_VALUE(inFeatures.size(0) == sizes[1], + "The number of input features must match the number of voxels"); + TORCH_CHECK_VALUE( + kernels.dim() == 5, + std::string( + "Expected kernels to have 5 dimensions (shape (in_ch, out_ch, d, h, w)) but got ") + + std::to_string(kernels.dim()) + " dimensions"); + TORCH_CHECK_VALUE( + kernels.size(0) == inFeatures.size(1), + "Expected input channels of kernels (" + std::to_string(kernels.size(0)) + + ") to equal input channels of features: " + std::to_string(inFeatures.size(1))); const int inC = kernels.size(0), outC = kernels.size(1); kWidth[0] = kernels.size(2); kWidth[1] = kernels.size(3); kWidth[2] = kernels.size(4); - kernels = kernels.permute({4, 3, 2, 0, 1}).reshape({-1, inC, outC}).contiguous(); + kernels = kernels.permute({ 4, 3, 2, 0, 1 }).reshape({ -1, inC, outC }).contiguous(); } // Save for backward - ctx->save_for_backward({inFeatures, kernels, nbmaps, nbsizes}); - ctx->saved_data["transposed"] = transposed; + ctx->save_for_backward({ inFeatures, kernels, nbmaps, nbsizes }); + 
ctx->saved_data["transposed"] = transposed; ctx->saved_data["kernel_width"] = kWidth; - ctx->saved_data["use_me"] = packInfo.useME(); + ctx->saved_data["use_me"] = packInfo.useME(); torch::Tensor output; if (packInfo.targetGrid().total_voxels() > 0) { auto opt = torch::TensorOptions().dtype(inFeatures.dtype()).device(inFeatures.device()); if (!transposed) { - output = torch::zeros({sizes[1], kernels.size(-1)}, opt); + output = torch::zeros({ sizes[1], kernels.size(-1) }, opt); } else { - output = torch::zeros({sizes[0], kernels.size(-1)}, opt); + output = torch::zeros({ sizes[0], kernels.size(-1) }, opt); } // NOTE: Francis: We need .cpu().contiguous() here because we copied the convolution // implementation from torch_sparse which runs std::max_element on a pointer @@ -98,29 +118,30 @@ struct SparseConvolutionKernelMap : public torch::autograd::Function( - inFeatures, output, kernels, nbmaps, - nbsizes.cpu().contiguous(), transposed, middleAcceleration); + inFeatures, output, kernels, nbmaps, nbsizes.cpu().contiguous(), transposed, + middleAcceleration); }); } else { auto opt = torch::TensorOptions().dtype(inFeatures.dtype()).device(inFeatures.device()); - output = torch::empty({0, kernels.size(-1)}, opt); + output = torch::empty({ 0, kernels.size(-1) }, opt); } - return {output}; + return { output }; } - static variable_list backward(AutogradContext *ctx, variable_list grad_output) { + static variable_list + backward(AutogradContext *ctx, variable_list grad_output) { // Use data saved in forward - variable_list saved = ctx->get_saved_variables(); - Variable inFeatures = saved.at(0); - Variable kernels = saved.at(1); - Variable nbmaps = saved.at(2); - Variable nbsizes = saved.at(3); - bool transposed = ctx->saved_data["transposed"].toBool(); - torch::Tensor kWidth = ctx->saved_data["kernel_width"].toTensor(); - bool use_me = ctx->saved_data["use_me"].toBool(); - - torch::Tensor gradInput = torch::zeros_like(inFeatures); + variable_list saved = ctx->get_saved_variables(); + Variable inFeatures = saved.at(0); + Variable kernels = saved.at(1); + Variable nbmaps = saved.at(2); + Variable nbsizes = saved.at(3); + bool transposed = ctx->saved_data["transposed"].toBool(); + torch::Tensor kWidth = ctx->saved_data["kernel_width"].toTensor(); + bool use_me = ctx->saved_data["use_me"].toBool(); + + torch::Tensor gradInput = torch::zeros_like(inFeatures); torch::Tensor gradWeight = torch::zeros_like(kernels); Variable gradOut = grad_output.at(0); @@ -141,14 +162,23 @@ struct SparseConvolutionKernelMap : public torch::autograd::Function(), kWidth[1].item(), kWidth[0].item(), inC, outC}).permute({4, 3, 2, 1, 0}); + gradWeight = gradWeight + .reshape({ kWidth[2].item(), kWidth[1].item(), + kWidth[0].item(), inC, outC }) + .permute({ 4, 3, 2, 1, 0 }); } else { - gradWeight = gradWeight.reshape({kWidth[2].item(), kWidth[1].item(), kWidth[0].item(), inC, outC}).permute({3, 4, 2, 1, 0}); + gradWeight = gradWeight + .reshape({ kWidth[2].item(), kWidth[1].item(), + kWidth[0].item(), inC, outC }) + .permute({ 3, 4, 2, 1, 0 }); } - return {gradInput, gradWeight, torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor()}; + return { gradInput, gradWeight, torch::Tensor(), + torch::Tensor(), torch::Tensor(), torch::Tensor() }; } }; } // namespace autograd } // namespace detail } // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_SPARSECONVOLUTIONKERNELMAP_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/SplatIntoGrid.cpp b/fvdb/src/detail/autograd/SplatIntoGrid.cpp index 
c73cd277a3..10b101de03 100644 --- a/fvdb/src/detail/autograd/SplatIntoGrid.cpp +++ b/fvdb/src/detail/autograd/SplatIntoGrid.cpp @@ -3,42 +3,45 @@ // #include "SplatIntoGrid.h" -#include "detail/ops/Ops.h" +#include +#include -#include "detail/utils/Utils.h" - - -void checkForwardInputs(c10::intrusive_ptr grid, - fvdb::detail::autograd::SplatIntoGridTrilinear::JaggedVariable points, - fvdb::detail::autograd::SplatIntoGridTrilinear::Variable data) { +void +checkForwardInputs(c10::intrusive_ptr grid, + fvdb::detail::autograd::SplatIntoGridTrilinear::JaggedVariable points, + fvdb::detail::autograd::SplatIntoGridTrilinear::Variable data) { grid->checkNonEmptyGrid(); - TORCH_CHECK_VALUE(points.device() == data.device(), "points and data must be on the same device"); + TORCH_CHECK_VALUE(points.device() == data.device(), + "points and data must be on the same device"); grid->checkDevice(points); grid->checkDevice(data); points.check_valid(); TORCH_CHECK_TYPE(points.is_floating_point(), "points must have a floating point type"); TORCH_CHECK_TYPE(points.dtype() == data.dtype(), "all tensors must have the same type"); - TORCH_CHECK_VALUE(points.rdim() == 2, "Expected points to have shape [B*M, 3] (wrong number of dimensions)"); + TORCH_CHECK_VALUE(points.rdim() == 2, + "Expected points to have shape [B*M, 3] (wrong number of dimensions)"); TORCH_CHECK(points.numel() > 0, "Empty tensor (points)"); TORCH_CHECK(points.rsize(1) == 3, "points must have shape [B*M, 3] (points must be 3D)"); TORCH_CHECK_TYPE(data.is_floating_point(), "point_data must have a floating point type"); - TORCH_CHECK_VALUE(data.dim() >= 2, "Expected data to have shape [B*M, *] (at least 3 dimensions)"); + TORCH_CHECK_VALUE(data.dim() >= 2, + "Expected data to have shape [B*M, *] (at least 3 dimensions)"); TORCH_CHECK(data.numel() > 0, "Empty tensor (data)"); - TORCH_CHECK(data.size(0) == points.rsize(0), "point_data must have one value per point (shape [B*M, *]) (incorrect first dimension must match number of points)"); + TORCH_CHECK( + data.size(0) == points.rsize(0), + "point_data must have one value per point (shape [B*M, *]) (incorrect first dimension must match number of points)"); } namespace fvdb { namespace detail { namespace autograd { - -SplatIntoGridTrilinear::variable_list SplatIntoGridTrilinear::forward(SplatIntoGridTrilinear::AutogradContext *ctx, - c10::intrusive_ptr grid, - SplatIntoGridTrilinear::JaggedVariable points, - SplatIntoGridTrilinear::Variable pointData) { - +SplatIntoGridTrilinear::variable_list +SplatIntoGridTrilinear::forward(SplatIntoGridTrilinear::AutogradContext *ctx, + c10::intrusive_ptr grid, + SplatIntoGridTrilinear::JaggedVariable points, + SplatIntoGridTrilinear::Variable pointData) { checkForwardInputs(grid, points, pointData); torch::Tensor outGridData = FVDB_DISPATCH_KERNEL_DEVICE(points.device(), [&]() { @@ -46,42 +49,41 @@ SplatIntoGridTrilinear::variable_list SplatIntoGridTrilinear::forward(SplatIntoG }); // Save data for backward in context - ctx->save_for_backward({pointData, points.jdata(), points.joffsets(), points.jlidx()}); + ctx->save_for_backward({ pointData, points.jdata(), points.joffsets(), points.jlidx() }); ctx->saved_data["grid"] = grid; // int64_t numOutputValues = grid->totalVoxels(); - return variable_list({outGridData}); + return variable_list({ outGridData }); } -SplatIntoGridTrilinear::variable_list SplatIntoGridTrilinear::backward(SplatIntoGridTrilinear::AutogradContext *ctx, - SplatIntoGridTrilinear::variable_list grad_output) { - 
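// The backward pass that follows relies on the adjoint relationship used throughout this
// file: trilinearly splatting point data into voxels and trilinearly sampling voxel data at
// those points are transposes of each other, so the gradient of a splat is a sample of the
// incoming gradient (hence the call to dispatchSampleGridTrilinear below). A dense 1-D
// sketch of that identity with made-up helper names, plain libtorch, no fvdb types:
#include <torch/torch.h>

// Splat scalar weights w at fractional positions p into a dense 1-D grid of size n.
torch::Tensor splat1d(const torch::Tensor &p, const torch::Tensor &w, int64_t n) {
    torch::Tensor i0 = p.floor().to(torch::kLong);   // left cell
    torch::Tensor f  = p - i0.to(p.scalar_type());   // fractional offset in [0, 1)
    torch::Tensor g  = torch::zeros({ n }, p.options());
    g.index_add_(0, i0, w - w * f);                  // contribution to the left cell: w * (1 - f)
    g.index_add_(0, i0 + 1, w * f);                  // contribution to the right cell
    return g;
}

// Sample a dense 1-D grid g at fractional positions p (linear interpolation).
torch::Tensor sample1d(const torch::Tensor &g, const torch::Tensor &p) {
    torch::Tensor i0 = p.floor().to(torch::kLong);
    torch::Tensor f  = p - i0.to(p.scalar_type());
    return torch::lerp(g.index_select(0, i0), g.index_select(0, i0 + 1), f);
}

// For any grid-shaped gradient dLdG: dot(splat1d(p, w, n), dLdG) == dot(w, sample1d(dLdG, p)),
// the 1-D analogue of returning a trilinear sample of gradOut as the point-data gradient.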
+SplatIntoGridTrilinear::variable_list +SplatIntoGridTrilinear::backward(SplatIntoGridTrilinear::AutogradContext *ctx, + SplatIntoGridTrilinear::variable_list grad_output) { // Use data saved in forward - variable_list saved = ctx->get_saved_variables(); - Variable pointData = saved.at(0); // [B*M, *] + variable_list saved = ctx->get_saved_variables(); + Variable pointData = saved.at(0); // [B*M, *] - Variable pointCoords = saved.at(1); // [B*M, 3] - Variable pointJOffsets = saved.at(2); // [B,] - Variable pointsJLidx = saved.at(3); // [B,] - auto grid = ctx->saved_data["grid"].toCustomClass(); - Variable gradOut = grad_output.at(0); // [N, *] + Variable pointCoords = saved.at(1); // [B*M, 3] + Variable pointJOffsets = saved.at(2); // [B,] + Variable pointsJLidx = saved.at(3); // [B,] + auto grid = ctx->saved_data["grid"].toCustomClass(); + Variable gradOut = grad_output.at(0); // [N, *] auto ret = FVDB_DISPATCH_KERNEL_DEVICE(gradOut.device(), [&]() { return ops::dispatchSampleGridTrilinear( - *grid, JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, pointsJLidx), gradOut); + *grid, + JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, pointsJLidx), + gradOut); }); - return {torch::Tensor(), torch::Tensor(), ret[0]}; + return { torch::Tensor(), torch::Tensor(), ret[0] }; } - - - -SplatIntoGridBezier::variable_list SplatIntoGridBezier::forward(SplatIntoGridBezier::AutogradContext *ctx, - c10::intrusive_ptr grid, - SplatIntoGridBezier::JaggedVariable points, - SplatIntoGridBezier::Variable pointData) { - +SplatIntoGridBezier::variable_list +SplatIntoGridBezier::forward(SplatIntoGridBezier::AutogradContext *ctx, + c10::intrusive_ptr grid, + SplatIntoGridBezier::JaggedVariable points, + SplatIntoGridBezier::Variable pointData) { checkForwardInputs(grid, points, pointData); torch::Tensor outGridData = FVDB_DISPATCH_KERNEL_DEVICE(points.device(), [&]() { @@ -89,32 +91,34 @@ SplatIntoGridBezier::variable_list SplatIntoGridBezier::forward(SplatIntoGridBez }); // Save data for backward in context - ctx->save_for_backward({pointData, points.jdata(), points.joffsets(), points.jlidx()}); + ctx->save_for_backward({ pointData, points.jdata(), points.joffsets(), points.jlidx() }); ctx->saved_data["grid"] = grid; - return variable_list({outGridData}); + return variable_list({ outGridData }); } -SplatIntoGridBezier::variable_list SplatIntoGridBezier::backward(SplatIntoGridBezier::AutogradContext *ctx, - SplatIntoGridBezier::variable_list grad_output) { - +SplatIntoGridBezier::variable_list +SplatIntoGridBezier::backward(SplatIntoGridBezier::AutogradContext *ctx, + SplatIntoGridBezier::variable_list grad_output) { // Use data saved in forward - variable_list saved = ctx->get_saved_variables(); - Variable pointData = saved.at(0); // [B*M, *] + variable_list saved = ctx->get_saved_variables(); + Variable pointData = saved.at(0); // [B*M, *] - Variable pointCoords = saved.at(1); // [B*M, 3] + Variable pointCoords = saved.at(1); // [B*M, 3] Variable pointJOffsets = saved.at(2); // [B,] - Variable pointsJLidx = saved.at(3); // [B,] + Variable pointsJLidx = saved.at(3); // [B,] - auto grid = ctx->saved_data["grid"].toCustomClass(); + auto grid = ctx->saved_data["grid"].toCustomClass(); Variable gradOut = grad_output.at(0); // [N, *] torch::Tensor outGradIn = FVDB_DISPATCH_KERNEL_DEVICE(gradOut.device(), [&]() { return ops::dispatchSampleGridBezier( - *grid, JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, pointsJLidx), gradOut)[0]; + *grid, + 
JaggedTensor::from_data_offsets_and_list_ids(pointCoords, pointJOffsets, pointsJLidx), + gradOut)[0]; }); - return {torch::Tensor(), torch::Tensor(), outGradIn}; + return { torch::Tensor(), torch::Tensor(), outGradIn }; } } // namespace autograd diff --git a/fvdb/src/detail/autograd/SplatIntoGrid.h b/fvdb/src/detail/autograd/SplatIntoGrid.h index a0e183178e..7074f28031 100644 --- a/fvdb/src/detail/autograd/SplatIntoGrid.h +++ b/fvdb/src/detail/autograd/SplatIntoGrid.h @@ -1,48 +1,43 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_SPLATINTOGRID_H +#define FVDB_DETAIL_AUTOGRAD_SPLATINTOGRID_H #include #include "detail/GridBatchImpl.h" - namespace fvdb { namespace detail { namespace autograd { struct SplatIntoGridTrilinear : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; - using JaggedVariable = JaggedTensor; + using Variable = torch::autograd::Variable; + using JaggedVariable = JaggedTensor; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr grid, - JaggedTensor points, - Variable pointData); + static variable_list forward(AutogradContext *ctx, c10::intrusive_ptr grid, + JaggedTensor points, Variable pointData); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; - struct SplatIntoGridBezier : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; - using JaggedVariable = JaggedTensor; + using Variable = torch::autograd::Variable; + using JaggedVariable = JaggedTensor; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr grid, - JaggedVariable points, - Variable pointData); + static variable_list forward(AutogradContext *ctx, c10::intrusive_ptr grid, + JaggedVariable points, Variable pointData); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_SPLATINTOGRID_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/TransformPoints.cpp b/fvdb/src/detail/autograd/TransformPoints.cpp index 90ca2713ea..762936faa4 100644 --- a/fvdb/src/detail/autograd/TransformPoints.cpp +++ b/fvdb/src/detail/autograd/TransformPoints.cpp @@ -3,27 +3,22 @@ // #include "TransformPoints.h" -#include +#include +#include #include -#include "detail/ops/Ops.h" -#include "detail/utils/Utils.h" - - +#include namespace fvdb { namespace detail { namespace autograd { - -TransformPoints::variable_list TransformPoints::forward(TransformPoints::AutogradContext *ctx, - c10::intrusive_ptr grid, - TransformPoints::JaggedVariable points, - Variable pointsData, - bool isInverse, - bool isDual) { - +TransformPoints::variable_list +TransformPoints::forward(TransformPoints::AutogradContext *ctx, + c10::intrusive_ptr grid, + TransformPoints::JaggedVariable points, Variable pointsData, + bool isInverse, bool isDual) { 
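// TransformPoints applies the grid's world-to-index-space map (or its inverse) pointwise,
// with the primal/dual flag selecting which of the two offset lattices the result is
// expressed on. Assuming a pure scale-and-translate transform (an assumption for this
// sketch; a general affine map behaves the same way), the Jacobian does not depend on the
// points themselves, which is why backward() further down only needs the saved grid and
// offsets to propagate gradOut. Made-up names, plain libtorch, no fvdb types:
#include <torch/torch.h>

// World -> voxel index space for an [N, 3] point tensor: ijk = (xyz - origin) / voxelSize.
torch::Tensor worldToVoxel(const torch::Tensor &xyz, const torch::Tensor &origin,
                           const torch::Tensor &voxelSize) {
    return (xyz - origin) / voxelSize;
}

// Gradient of the map above w.r.t. the points: a constant diagonal scaling by 1 / voxelSize.
torch::Tensor worldToVoxelBackward(const torch::Tensor &gradOut, const torch::Tensor &voxelSize) {
    return gradOut / voxelSize;
}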
grid->checkDevice(points); TORCH_CHECK_VALUE(points.rdim() == 2, "points must have shape [B*N, 3]"); TORCH_CHECK_VALUE(points.rsize(-1) == 3, "points must have shape [B*N, 3]"); @@ -36,58 +31,59 @@ TransformPoints::variable_list TransformPoints::forward(TransformPoints::Autogra torch::Tensor outTxPoints; if (isInverse) { outTxPoints = FVDB_DISPATCH_KERNEL_DEVICE(points.device(), [&]() { - return ops::dispatchInvTransformPointsToGrid( - *grid, pointsWrap, !isDual); + return ops::dispatchInvTransformPointsToGrid(*grid, pointsWrap, !isDual); }); } else { outTxPoints = FVDB_DISPATCH_KERNEL_DEVICE(points.device(), [&]() { - return ops::dispatchTransformPointsToGrid( - *grid, pointsWrap, !isDual); + return ops::dispatchTransformPointsToGrid(*grid, pointsWrap, !isDual); }); } - ctx->save_for_backward({points.joffsets(), points.jlidx()}); + ctx->save_for_backward({ points.joffsets(), points.jlidx() }); - ctx->saved_data["grid"] = grid; - ctx->saved_data["isDual"] = isDual; + ctx->saved_data["grid"] = grid; + ctx->saved_data["isDual"] = isDual; ctx->saved_data["isInverse"] = isInverse; - return {outTxPoints}; // [B*N, 3] + return { outTxPoints }; // [B*N, 3] } - -TransformPoints::variable_list TransformPoints::backward(TransformPoints::AutogradContext *ctx, - TransformPoints::variable_list grad_output) { - +TransformPoints::variable_list +TransformPoints::backward(TransformPoints::AutogradContext *ctx, + TransformPoints::variable_list grad_output) { variable_list saved = ctx->get_saved_variables(); Variable pointsJOffsets = saved.at(0); - Variable pointsJLidx = saved.at(1); - Variable gradOut = grad_output.at(0); // [B*N, 3] + Variable pointsJLidx = saved.at(1); + Variable gradOut = grad_output.at(0); // [B*N, 3] // Use data saved in forward - auto grid = ctx->saved_data["grid"].toCustomClass(); - const bool isDual = ctx->saved_data["isDual"].toBool(); + auto grid = ctx->saved_data["grid"].toCustomClass(); + const bool isDual = ctx->saved_data["isDual"].toBool(); const bool isInverse = ctx->saved_data["isInverse"].toBool(); Variable outGradIn; // = torch::empty_like(gradOut); // [B*N, 3] if (isInverse) { outGradIn = FVDB_DISPATCH_KERNEL_DEVICE(gradOut.device(), [&]() { return ops::dispatchInvTransformPointsToGridBackward( - *grid, JaggedTensor::from_data_offsets_and_list_ids(gradOut, pointsJOffsets, pointsJLidx), !isDual); + *grid, + JaggedTensor::from_data_offsets_and_list_ids(gradOut, pointsJOffsets, pointsJLidx), + !isDual); }); } else { outGradIn = FVDB_DISPATCH_KERNEL_DEVICE(gradOut.device(), [&]() { return ops::dispatchTransformPointsToGridBackward( - *grid, JaggedTensor::from_data_offsets_and_list_ids(gradOut, pointsJOffsets, pointsJLidx), !isDual); + *grid, + JaggedTensor::from_data_offsets_and_list_ids(gradOut, pointsJOffsets, pointsJLidx), + !isDual); }); } - // Variable outGradIn = outGradInReshape.reshape(getShapeButReplaceFirstDim(fineData.size(0), gradOut)); - return {torch::Tensor(), torch::Tensor(), outGradIn, torch::Tensor(), torch::Tensor()}; + // Variable outGradIn = outGradInReshape.reshape(getShapeButReplaceFirstDim(fineData.size(0), + // gradOut)); + return { torch::Tensor(), torch::Tensor(), outGradIn, torch::Tensor(), torch::Tensor() }; } - } // namespace autograd } // namespace detail } // namespace fvdb \ No newline at end of file diff --git a/fvdb/src/detail/autograd/TransformPoints.h b/fvdb/src/detail/autograd/TransformPoints.h index 8d8d166b4b..47302e9085 100644 --- a/fvdb/src/detail/autograd/TransformPoints.h +++ b/fvdb/src/detail/autograd/TransformPoints.h @@ -1,34 
+1,32 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_TRANSFORMPOINTS_H +#define FVDB_DETAIL_AUTOGRAD_TRANSFORMPOINTS_H #include #include "detail/GridBatchImpl.h" - namespace fvdb { namespace detail { namespace autograd { struct TransformPoints : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; - using JaggedVariable = JaggedTensor; - - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr grid, - JaggedVariable points, - Variable pointsData, - bool isInverse, + using Variable = torch::autograd::Variable; + using JaggedVariable = JaggedTensor; + + static variable_list forward(AutogradContext *ctx, c10::intrusive_ptr grid, + JaggedVariable points, Variable pointsData, bool isInverse, bool isDual); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_TRANSFORMPOINTS_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/UpsampleGrid.cpp b/fvdb/src/detail/autograd/UpsampleGrid.cpp index 767e42176a..9d1f60037e 100644 --- a/fvdb/src/detail/autograd/UpsampleGrid.cpp +++ b/fvdb/src/detail/autograd/UpsampleGrid.cpp @@ -3,74 +3,70 @@ // #include "UpsampleGrid.h" -#include +#include +#include #include -#include "detail/ops/Ops.h" -#include "detail/utils/Utils.h" +#include namespace fvdb { namespace detail { namespace autograd { -UpsampleGrid::variable_list UpsampleGrid::forward(UpsampleGrid::AutogradContext *ctx, - c10::intrusive_ptr coarseGrid, - c10::intrusive_ptr fineGrid, - nanovdb::Coord upsamplingFactor, - UpsampleGrid::Variable coarseData) { +UpsampleGrid::variable_list +UpsampleGrid::forward(UpsampleGrid::AutogradContext *ctx, + c10::intrusive_ptr coarseGrid, + c10::intrusive_ptr fineGrid, nanovdb::Coord upsamplingFactor, + UpsampleGrid::Variable coarseData) { // Save data for backward in context - ctx->save_for_backward({coarseData}); + ctx->save_for_backward({ coarseData }); - ctx->saved_data["coarse_grid"] = coarseGrid; - ctx->saved_data["fine_grid"] = fineGrid; - ctx->saved_data["upsampling_factor_x"] = (int64_t) upsamplingFactor[0]; - ctx->saved_data["upsampling_factor_y"] = (int64_t) upsamplingFactor[1]; - ctx->saved_data["upsampling_factor_z"] = (int64_t) upsamplingFactor[2]; + ctx->saved_data["coarse_grid"] = coarseGrid; + ctx->saved_data["fine_grid"] = fineGrid; + ctx->saved_data["upsampling_factor_x"] = (int64_t)upsamplingFactor[0]; + ctx->saved_data["upsampling_factor_y"] = (int64_t)upsamplingFactor[1]; + ctx->saved_data["upsampling_factor_z"] = (int64_t)upsamplingFactor[2]; if (fineGrid->totalVoxels() == 0) { - return variable_list({torch::empty({0, coarseData.size(1)}, coarseData.options())}); + return variable_list({ torch::empty({ 0, coarseData.size(1) }, coarseData.options()) }); } torch::Tensor ret = FVDB_DISPATCH_KERNEL_DEVICE(coarseData.device(), [&]() { - return ops::dispatchUpsampleGridNearest( - *coarseGrid, *fineGrid, coarseData, upsamplingFactor); + return ops::dispatchUpsampleGridNearest(*coarseGrid, *fineGrid, coarseData, + upsamplingFactor); }); - return variable_list({ret}); + return 
variable_list({ ret }); } -UpsampleGrid::variable_list UpsampleGrid::backward(UpsampleGrid::AutogradContext *ctx, - UpsampleGrid::variable_list grad_output) { - +UpsampleGrid::variable_list +UpsampleGrid::backward(UpsampleGrid::AutogradContext *ctx, + UpsampleGrid::variable_list grad_output) { // // Use data saved in forward - variable_list saved = ctx->get_saved_variables(); - Variable coarseData = saved.at(0); + variable_list saved = ctx->get_saved_variables(); + Variable coarseData = saved.at(0); - auto fineGrid = ctx->saved_data["fine_grid"].toCustomClass(); - auto coarseGrid = ctx->saved_data["coarse_grid"].toCustomClass(); - const int64_t upsamplingFactorX = ctx->saved_data["upsampling_factor_x"].toInt(); - const int64_t upsamplingFactorY = ctx->saved_data["upsampling_factor_y"].toInt(); - const int64_t upsamplingFactorZ = ctx->saved_data["upsampling_factor_z"].toInt(); + auto fineGrid = ctx->saved_data["fine_grid"].toCustomClass(); + auto coarseGrid = ctx->saved_data["coarse_grid"].toCustomClass(); + const int64_t upsamplingFactorX = ctx->saved_data["upsampling_factor_x"].toInt(); + const int64_t upsamplingFactorY = ctx->saved_data["upsampling_factor_y"].toInt(); + const int64_t upsamplingFactorZ = ctx->saved_data["upsampling_factor_z"].toInt(); const nanovdb::Coord upsamplingFactor(upsamplingFactorX, upsamplingFactorY, upsamplingFactorZ); - Variable gradOut = grad_output.at(0); // [#fine_voxels, *] + Variable gradOut = grad_output.at(0); // [#fine_voxels, *] if (fineGrid->totalVoxels() == 0) { auto ret = torch::zeros_like(coarseData); - return {torch::Tensor(), torch::Tensor(), torch::Tensor(), ret}; + return { torch::Tensor(), torch::Tensor(), torch::Tensor(), ret }; } torch::Tensor outGradIn = FVDB_DISPATCH_KERNEL_DEVICE(coarseData.device(), [&]() { - return ops::dispatchUpsampleGridNearestBackward( - *fineGrid, *coarseGrid, - gradOut, - coarseData, - upsamplingFactor - ); + return ops::dispatchUpsampleGridNearestBackward(*fineGrid, *coarseGrid, gradOut, + coarseData, upsamplingFactor); }); - return {torch::Tensor(), torch::Tensor(), torch::Tensor(), outGradIn}; + return { torch::Tensor(), torch::Tensor(), torch::Tensor(), outGradIn }; } -} // namespace autograd -} // namespace detail -} // namespace fvdb +} // namespace autograd +} // namespace detail +} // namespace fvdb diff --git a/fvdb/src/detail/autograd/UpsampleGrid.h b/fvdb/src/detail/autograd/UpsampleGrid.h index 5457f0e063..5d3cf84707 100644 --- a/fvdb/src/detail/autograd/UpsampleGrid.h +++ b/fvdb/src/detail/autograd/UpsampleGrid.h @@ -1,31 +1,31 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_UPSAMPLEGRID_H +#define FVDB_DETAIL_AUTOGRAD_UPSAMPLEGRID_H + #include #include "detail/GridBatchImpl.h" - namespace fvdb { namespace detail { namespace autograd { struct UpsampleGrid : public torch::autograd::Function { - using variable_list = torch::autograd::variable_list; + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - c10::intrusive_ptr coarseGrid, + static variable_list forward(AutogradContext *ctx, c10::intrusive_ptr coarseGrid, c10::intrusive_ptr fineGrid, - nanovdb::Coord upsamplingFactor, - Variable coarseData); + nanovdb::Coord upsamplingFactor, Variable coarseData); - static variable_list backward(AutogradContext *ctx, - 
variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_UPSAMPLEGRID_H \ No newline at end of file diff --git a/fvdb/src/detail/autograd/VolumeRender.cpp b/fvdb/src/detail/autograd/VolumeRender.cpp index c78b37f8e4..341ddf07f8 100644 --- a/fvdb/src/detail/autograd/VolumeRender.cpp +++ b/fvdb/src/detail/autograd/VolumeRender.cpp @@ -3,22 +3,20 @@ // #include "VolumeRender.h" -#include "detail/ops/Ops.h" -#include "detail/utils/Utils.h" +#include +#include namespace fvdb { namespace detail { namespace autograd { -VolumeRender::variable_list VolumeRender::forward(VolumeRender::AutogradContext *ctx, - const VolumeRender::Variable& sigmas, - const VolumeRender::Variable& rgbs, - const VolumeRender::Variable& deltaTs, - const VolumeRender::Variable& ts, - const VolumeRender::Variable& jOffsets, - double tsmtThreshold) { +VolumeRender::variable_list +VolumeRender::forward(VolumeRender::AutogradContext *ctx, const VolumeRender::Variable &sigmas, + const VolumeRender::Variable &rgbs, const VolumeRender::Variable &deltaTs, + const VolumeRender::Variable &ts, const VolumeRender::Variable &jOffsets, + double tsmtThreshold) { const int numRays = jOffsets.size(0) - 1; - const int N = sigmas.size(0); + const int N = sigmas.size(0); TORCH_CHECK(jOffsets.dim() == 1, "jOffsets must have shape (nRays+1,)"); TORCH_CHECK(sigmas.dim() == 1, "sigmas must have shape (nRays*nSamplesPerRay,)"); @@ -31,76 +29,82 @@ VolumeRender::variable_list VolumeRender::forward(VolumeRender::AutogradContext TORCH_CHECK(sigmas.device() == ts.device(), "All tensors must be on the same device"); TORCH_CHECK(sigmas.device() == jOffsets.device(), "All tensors must be on the same device"); - TORCH_CHECK(sigmas.dtype() == rgbs.dtype(), "All floating point tensors must be on the same dtype"); - TORCH_CHECK(sigmas.dtype() == deltaTs.dtype(), "All floating point tensors must be on the same dtype"); - TORCH_CHECK(sigmas.dtype() == ts.dtype(),"All floating point tensors must be on the same dtype"); - TORCH_CHECK(jOffsets.dtype() == torch::dtype(JOffsetsScalarType).dtype(), "jOffsets must be of type torch.int32"); - - TORCH_CHECK(sigmas.size(0) == rgbs.size(0), "sigmas and rgbs must have the same number of elements"); - TORCH_CHECK(sigmas.size(0) == deltaTs.size(0), "sigmas and deltaTs must have the same number of elements"); - TORCH_CHECK(sigmas.size(0) == ts.size(0), "sigmas and ts must have the same number of elements"); - torch::Tensor outOpacity = torch::zeros({numRays}, sigmas.options()); - torch::Tensor outDepth = torch::zeros({numRays}, sigmas.options()); + TORCH_CHECK(sigmas.dtype() == rgbs.dtype(), + "All floating point tensors must be on the same dtype"); + TORCH_CHECK(sigmas.dtype() == deltaTs.dtype(), + "All floating point tensors must be on the same dtype"); + TORCH_CHECK(sigmas.dtype() == ts.dtype(), + "All floating point tensors must be on the same dtype"); + TORCH_CHECK(jOffsets.dtype() == torch::dtype(JOffsetsScalarType).dtype(), + "jOffsets must be of type torch.int32"); + + TORCH_CHECK(sigmas.size(0) == rgbs.size(0), + "sigmas and rgbs must have the same number of elements"); + TORCH_CHECK(sigmas.size(0) == deltaTs.size(0), + "sigmas and deltaTs must have the same number of elements"); + TORCH_CHECK(sigmas.size(0) == ts.size(0), + "sigmas and ts must have the same number of elements"); + torch::Tensor outOpacity = 
torch::zeros({ numRays }, sigmas.options()); + torch::Tensor outDepth = torch::zeros({ numRays }, sigmas.options()); // torch::Tensor outDepthSq = torch::zeros({numRays}, sigmas.options()); - torch::Tensor outRgb = torch::zeros({numRays, 3}, sigmas.options()); - torch::Tensor outWs = torch::zeros({N}, sigmas.options()); - torch::Tensor outTotalSamples = torch::zeros({numRays}, torch::dtype(torch::kLong).device(sigmas.device())); + torch::Tensor outRgb = torch::zeros({ numRays, 3 }, sigmas.options()); + torch::Tensor outWs = torch::zeros({ N }, sigmas.options()); + torch::Tensor outTotalSamples = + torch::zeros({ numRays }, torch::dtype(torch::kLong).device(sigmas.device())); FVDB_DISPATCH_KERNEL_DEVICE(sigmas.device(), [&]() { - ops::dispatchVolumeRender( - sigmas, rgbs, deltaTs, ts, jOffsets, tsmtThreshold, - outOpacity, outDepth, outRgb, outWs, outTotalSamples); + ops::dispatchVolumeRender(sigmas, rgbs, deltaTs, ts, jOffsets, tsmtThreshold, + outOpacity, outDepth, outRgb, outWs, outTotalSamples); }); ctx->saved_data["tsmtThreshold"] = tsmtThreshold; - ctx->save_for_backward({ - sigmas, rgbs, deltaTs, ts, jOffsets, - outOpacity, outDepth, outRgb, outWs - }); + ctx->save_for_backward( + { sigmas, rgbs, deltaTs, ts, jOffsets, outOpacity, outDepth, outRgb, outWs }); return { outRgb, outDepth, outOpacity, outWs, outTotalSamples }; } -VolumeRender::variable_list VolumeRender::backward(VolumeRender::AutogradContext *ctx, - VolumeRender::variable_list grad_output) { - Variable dLdRgb = grad_output.at(0); - Variable dLdDepth = grad_output.at(1); +VolumeRender::variable_list +VolumeRender::backward(VolumeRender::AutogradContext *ctx, + VolumeRender::variable_list grad_output) { + Variable dLdRgb = grad_output.at(0); + Variable dLdDepth = grad_output.at(1); Variable dLdOpacity = grad_output.at(2); - Variable dLdWs = grad_output.at(3); + Variable dLdWs = grad_output.at(3); // Variable dLdDepthSq = grad_output.at(3); - variable_list saved = ctx->get_saved_variables(); - Variable sigmas = saved.at(0); - Variable rgbs = saved.at(1); - Variable deltaTs = saved.at(2); - Variable ts = saved.at(3); - Variable jOffsets = saved.at(4); + variable_list saved = ctx->get_saved_variables(); + Variable sigmas = saved.at(0); + Variable rgbs = saved.at(1); + Variable deltaTs = saved.at(2); + Variable ts = saved.at(3); + Variable jOffsets = saved.at(4); Variable outOpacity = saved.at(5); - Variable outDepth = saved.at(6); + Variable outDepth = saved.at(6); // Variable outDepthSq = ctx->saved_data["outDepthSq"].toTensor(); - Variable outRgb = saved.at(7); - Variable outWs = saved.at(8); + Variable outRgb = saved.at(7); + Variable outWs = saved.at(8); const double tsmtThreshold = ctx->saved_data["tsmtThreshold"].toDouble(); const int N = sigmas.size(0); - Variable dLdSigmas = torch::zeros({N}, sigmas.options()); - Variable dLdRgbs = torch::zeros({N, 3}, sigmas.options()); + Variable dLdSigmas = torch::zeros({ N }, sigmas.options()); + Variable dLdRgbs = torch::zeros({ N, 3 }, sigmas.options()); FVDB_DISPATCH_KERNEL_DEVICE(sigmas.device(), [&]() { ops::dispatchVolumeRenderBackward( - dLdOpacity, dLdDepth, /*dLdDepthSq,*/ dLdRgb, dLdWs, - sigmas, rgbs, outWs, deltaTs, ts, jOffsets, - outOpacity, outDepth, /*outDepthSq,*/ outRgb, tsmtThreshold, - dLdSigmas, dLdRgbs); + dLdOpacity, dLdDepth, /*dLdDepthSq,*/ dLdRgb, dLdWs, sigmas, rgbs, outWs, deltaTs, ts, + jOffsets, outOpacity, outDepth, /*outDepthSq,*/ outRgb, tsmtThreshold, dLdSigmas, + dLdRgbs); }); - return { dLdSigmas, dLdRgbs, torch::Tensor(), torch::Tensor(), 
torch::Tensor(), torch::Tensor() }; + return { + dLdSigmas, dLdRgbs, torch::Tensor(), torch::Tensor(), torch::Tensor(), torch::Tensor() + }; } - } // namespace autograd } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/autograd/VolumeRender.h b/fvdb/src/detail/autograd/VolumeRender.h index 010dff48ab..8c20722435 100644 --- a/fvdb/src/detail/autograd/VolumeRender.h +++ b/fvdb/src/detail/autograd/VolumeRender.h @@ -1,33 +1,29 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_AUTOGRAD_VOLUMERENDER_H +#define FVDB_DETAIL_AUTOGRAD_VOLUMERENDER_H #include - namespace fvdb { namespace detail { namespace autograd { -struct VolumeRender : public torch::autograd::Function -{ - using variable_list = torch::autograd::variable_list; +struct VolumeRender : public torch::autograd::Function { + using variable_list = torch::autograd::variable_list; using AutogradContext = torch::autograd::AutogradContext; - using Variable = torch::autograd::Variable; + using Variable = torch::autograd::Variable; - static variable_list forward(AutogradContext *ctx, - const Variable& sigmas, - const Variable& rgbs, - const Variable& deltaTs, - const Variable& ts, - const Variable& raysAcc, - double tsmtThreshold); + static variable_list forward(AutogradContext *ctx, const Variable &sigmas, const Variable &rgbs, + const Variable &deltaTs, const Variable &ts, + const Variable &raysAcc, double tsmtThreshold); - static variable_list backward(AutogradContext *ctx, - variable_list grad_output); + static variable_list backward(AutogradContext *ctx, variable_list grad_output); }; } // namespace autograd } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_AUTOGRAD_VOLUMERENDER_H \ No newline at end of file diff --git a/fvdb/src/detail/build/Build.h b/fvdb/src/detail/build/Build.h index f119f242ec..38a323e4c0 100644 --- a/fvdb/src/detail/build/Build.h +++ b/fvdb/src/detail/build/Build.h @@ -1,15 +1,14 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#pragma once +#ifndef FVDB_DETAIL_BUILD_BUILD_H +#define FVDB_DETAIL_BUILD_BUILD_H -#include - -#include "detail/VoxelCoordTransform.h" -#include "detail/GridBatchImpl.h" - -#include "detail/utils/Utils.h" +#include +#include +#include +#include namespace fvdb { namespace detail { @@ -27,37 +26,42 @@ nanovdb::GridHandle buildEmptyGrid(torch::Device device, bool /// @param batchSize The number of grids in the batch /// @param size The size of the grid in voxels /// @param ijkMin The coordinate of the bottom-back-left corner of the grid -/// @param mask An optional mask tensor that can be used to mask out some of the voxels (shape = size) +/// @param mask An optional mask tensor that can be used to mask out some of the voxels (shape = +/// size) /// @return A handle to the nanovdb grid nanovdb::GridHandle buildDenseGrid(torch::Device device, bool isMutable, - const uint32_t batchSize, - const nanovdb::Coord& size, - const nanovdb::Coord& ijkMin, - const torch::optional& mask); + const uint32_t batchSize, + const nanovdb::Coord &size, + const nanovdb::Coord &ijkMin, + const torch::optional &mask); /// @brief Build a NanoVDB grid representing the coarse grid of a given fine grid /// @param isMutable Whether the grid should be mutable or not /// @param fineGridHdl The handle to the fine grid -/// @param branchingFactor The coarsening factor from the fine grid to the coarse grid (i.e. 
N = [2, 2, 2] for a 2x2x2 coarsening) +/// @param branchingFactor The coarsening factor from the fine grid to the coarse grid (i.e. N = [2, +/// 2, 2] for a 2x2x2 coarsening) /// @return A handle to the nanovdb grid (the device will match fineGridHdl) -nanovdb::GridHandle buildCoarseGridFromFineGrid(bool isMutable, - const GridBatchImpl& fineGridHdl, - const nanovdb::Coord branchingFactor); +nanovdb::GridHandle +buildCoarseGridFromFineGrid(bool isMutable, const GridBatchImpl &fineGridHdl, + const nanovdb::Coord branchingFactor); /// @brief Build a NanoVDB grid representing the fine grid of a given coarse grid /// @param isMutable Whether the grid should be mutable or not /// @param coarseGridHdl The handle to the coarse grid -/// @param subdivMask An optional mask JaggedTensor that can be used to not refine certain voxels (shape = [B, -1] matching number of coarse voxels) -/// @param subdivisionFactor The refinement factor from the coarse grid to the fine grid (i.e. (2, 2, 2) for a 2x2x2 refinement) +/// @param subdivMask An optional mask JaggedTensor that can be used to not refine certain voxels +/// (shape = [B, -1] matching number of coarse voxels) +/// @param subdivisionFactor The refinement factor from the coarse grid to the fine grid (i.e. (2, +/// 2, 2) for a 2x2x2 refinement) /// @return A handle to the nanovdb grid (the device will match coarseGridHdl) -nanovdb::GridHandle buildFineGridFromCoarseGrid(bool isMutable, - const GridBatchImpl& coarseGridHdl, - const torch::optional& subdivMask, - const nanovdb::Coord subdivisionFactor); +nanovdb::GridHandle +buildFineGridFromCoarseGrid(bool isMutable, const GridBatchImpl &coarseGridHdl, + const torch::optional &subdivMask, + const nanovdb::Coord subdivisionFactor); -nanovdb::GridHandle buildConvGridFromGrid(bool isMutable, - const GridBatchImpl& baseGridHdl, - const nanovdb::Coord& kernelSize, const nanovdb::Coord& stride); +nanovdb::GridHandle buildConvGridFromGrid(bool isMutable, + const GridBatchImpl &baseGridHdl, + const nanovdb::Coord &kernelSize, + const nanovdb::Coord &stride); /// @brief Build a NanoVDB grid which is a padded version of the given grid /// @param isMutable Whether the grid should be mutable or not @@ -66,11 +70,13 @@ nanovdb::GridHandle buildConvGridFromGrid(bool isMutable, /// @param bmax The padding in the positive direction /// @param excludeBorder Whether to exclude the border voxels from padding /// @return A handle to the padded nanovdb grid (the device will match baseGridHdl) -nanovdb::GridHandle buildPaddedGridFromGrid(bool isMutable, - const GridBatchImpl& baseGridHdl, - int bmin, int bmax, bool excludeBorder); +nanovdb::GridHandle buildPaddedGridFromGrid(bool isMutable, + const GridBatchImpl &baseGridHdl, + int bmin, int bmax, + bool excludeBorder); -/// @brief Build a NanoVDB grid from a set of points and pad each voxel ijk which contains a point from ijk - bmin to ijk + bmax +/// @brief Build a NanoVDB grid from a set of points and pad each voxel ijk which contains a point +/// from ijk - bmin to ijk + bmax /// @param device The device on which the grid will be allocated /// @param isMutable Whether the grid should be mutable or not /// @param points The points to be encoded in the grid (JaggedTensor of shape = (B, -1, 3)) @@ -78,23 +84,24 @@ nanovdb::GridHandle buildPaddedGridFromGrid(bool isMutable, /// @param bmin The minimum padding (i.e. we pad ijk from ijk - bmin to ijk + bmax) /// @param bmax The maximum padding (i.e. 
we pad ijk from ijk - bmin to ijk + bmax) /// @return A handle to the nanovdb grid (the device will match points) -nanovdb::GridHandle buildPaddedGridFromPoints(bool isMutable, - const JaggedTensor& points, - const std::vector& tx, - const nanovdb::Coord& bmin, - const nanovdb::Coord& bmax); +nanovdb::GridHandle +buildPaddedGridFromPoints(bool isMutable, const JaggedTensor &points, + const std::vector &tx, const nanovdb::Coord &bmin, + const nanovdb::Coord &bmax); -/// @brief Build a NanoVDB grid from a set of points where the 8 nearest voxels to each point are added to the grid +/// @brief Build a NanoVDB grid from a set of points where the 8 nearest voxels to each point are +/// added to the grid /// @param device The device on which the grid will be allocated /// @param isMutable Whether the grid should be mutable or not /// @param points The points to be encoded in the grid (JaggedTensor of shape = (B, -1, 3)) /// @param tx Transform from world to voxel coordinates /// @return A handle to the nanovdb grid (the device will match points) -nanovdb::GridHandle buildNearestNeighborGridFromPoints(bool isMutable, - const JaggedTensor& points, - const std::vector& tx); +nanovdb::GridHandle +buildNearestNeighborGridFromPoints(bool isMutable, const JaggedTensor &points, + const std::vector &tx); -/// @brief Build a NanoVDB grid from a set of ijk coordinates pad each voxel from ijk - bmin to ijk + bmax +/// @brief Build a NanoVDB grid from a set of ijk coordinates pad each voxel from ijk - bmin to ijk +/// + bmax /// @param device The device on which the grid will be allocated /// @param isMutable Whether the grid should be mutable or not /// @param coords The ijk coordinates to be encoded in the grid (JaggedTensor of shape = (B, -1, 3)) @@ -102,22 +109,25 @@ nanovdb::GridHandle buildNearestNeighborGridFromPoints(bool i /// @param bmin The minimum padding (i.e. we pad ijk from ijk - bmin to ijk + bmax) /// @param bmax The maximum padding (i.e. we pad ijk from ijk - bmin to ijk + bmax) /// @return A handle to the nanovdb grid (the device will match coords) -nanovdb::GridHandle buildPaddedGridFromCoords(bool isMutable, - const JaggedTensor& coords, - const nanovdb::Coord& bmin, - const nanovdb::Coord& bmax); - +nanovdb::GridHandle buildPaddedGridFromCoords(bool isMutable, + const JaggedTensor &coords, + const nanovdb::Coord &bmin, + const nanovdb::Coord &bmax); -/// @brief Build a NanoVDB grid by voxelizing a mesh (i.e. each voxel in the ouput grid intersects the mesh) +/// @brief Build a NanoVDB grid by voxelizing a mesh (i.e. 
each voxel in the ouput grid intersects +/// the mesh) /// @param isMutable Whether the grid should be mutable or not -/// @param meshVertices A JaggedTensor of shape = (B, -1, 3) containing the vertices of each mesh to voxelize -/// @param meshFaces A JaggedTensor of shape = (B, -1, 3) containing the face indexes of each mesh to voxelize +/// @param meshVertices A JaggedTensor of shape = (B, -1, 3) containing the vertices of each mesh to +/// voxelize +/// @param meshFaces A JaggedTensor of shape = (B, -1, 3) containing the face indexes of each mesh +/// to voxelize /// @return A handle to the nanovdb grid (the device will match meshVertices and meshFaces) -nanovdb::GridHandle buildGridFromMesh(bool isMutable, - const JaggedTensor meshVertices, - const JaggedTensor meshFaces, - const std::vector& tx); +nanovdb::GridHandle +buildGridFromMesh(bool isMutable, const JaggedTensor meshVertices, const JaggedTensor meshFaces, + const std::vector &tx); } // namespace build } // namespace detail -} // namespace fvdb \ No newline at end of file +} // namespace fvdb + +#endif // FVDB_DETAIL_BUILD_BUILD_H \ No newline at end of file diff --git a/fvdb/src/detail/build/CoarseFromFine.cpp b/fvdb/src/detail/build/CoarseFromFine.cpp index 6a403165b1..d4b789d063 100644 --- a/fvdb/src/detail/build/CoarseFromFine.cpp +++ b/fvdb/src/detail/build/CoarseFromFine.cpp @@ -3,47 +3,47 @@ // #include "Build.h" +#include +#include + #include -#include #include - -#include "detail/utils/Utils.h" -#include "detail/ops/Ops.h" - +#include namespace fvdb { namespace detail { namespace build { - template -nanovdb::GridHandle buildCoarseGridFromFineGridCPU(const GridBatchImpl& fineBatchHdl, - const nanovdb::Coord branchingFactor) { - +nanovdb::GridHandle +buildCoarseGridFromFineGridCPU(const GridBatchImpl &fineBatchHdl, + const nanovdb::Coord branchingFactor) { using IndexTree = nanovdb::NanoTree; - const nanovdb::GridHandle& fineGridHdl = fineBatchHdl.nanoGridHandle(); + const nanovdb::GridHandle &fineGridHdl = fineBatchHdl.nanoGridHandle(); std::vector> batchHandles; batchHandles.reserve(fineGridHdl.gridCount()); for (uint32_t bidx = 0; bidx < fineGridHdl.gridCount(); bidx += 1) { - const nanovdb::NanoGrid* fineGrid = fineGridHdl.template grid(bidx); + const nanovdb::NanoGrid *fineGrid = fineGridHdl.template grid(bidx); if (!fineGrid) { throw std::runtime_error("Failed to get pointer to nanovdb index grid"); } - const IndexTree& fineTree = fineGrid->tree(); + const IndexTree &fineTree = fineGrid->tree(); - using ProxyGridT = nanovdb::tools::build::Grid; - auto proxyGrid = std::make_shared(-1.0f); + using ProxyGridT = nanovdb::tools::build::Grid; + auto proxyGrid = std::make_shared(-1.0f); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); for (auto it = ActiveVoxelIterator(fineTree); it.isValid(); it++) { - const nanovdb::Coord coarseIjk = (it->first.asVec3d() / branchingFactor.asVec3d()).floor(); + const nanovdb::Coord coarseIjk = + (it->first.asVec3d() / branchingFactor.asVec3d()).floor(); proxyGridAccessor.setValue(coarseIjk, 1.0f); } proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); ret.buffer().setDevice(torch::kCPU, true); batchHandles.push_back(std::move(ret)); } @@ -55,12 +55,12 @@ nanovdb::GridHandle buildCoarseGridFromFineGridCPU(const Grid } } - -nanovdb::GridHandle buildCoarseGridFromFineGrid(bool isMutable, - const GridBatchImpl& fineBatchHdl, - const nanovdb::Coord 
branchingFactor) { +nanovdb::GridHandle +buildCoarseGridFromFineGrid(bool isMutable, const GridBatchImpl &fineBatchHdl, + const nanovdb::Coord branchingFactor) { if (fineBatchHdl.device().is_cuda()) { - JaggedTensor coords = ops::dispatchCoarseIJKForFineGrid(fineBatchHdl, branchingFactor); + JaggedTensor coords = + ops::dispatchCoarseIJKForFineGrid(fineBatchHdl, branchingFactor); return ops::dispatchCreateNanoGridFromIJK(coords, isMutable); } else { return FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { @@ -69,7 +69,6 @@ nanovdb::GridHandle buildCoarseGridFromFineGrid(bool isMutabl } } - } // namespace build } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/build/ConvGrid.cpp b/fvdb/src/detail/build/ConvGrid.cpp index 455cfdf9a9..cfa2e56e8d 100644 --- a/fvdb/src/detail/build/ConvGrid.cpp +++ b/fvdb/src/detail/build/ConvGrid.cpp @@ -3,46 +3,47 @@ // #include "Build.h" +#include +#include + #include -#include #include - -#include "detail/utils/Utils.h" -#include "detail/ops/Ops.h" - +#include namespace fvdb { namespace detail { namespace build { template -nanovdb::GridHandle buildCoarseGridFromFineGridCPU(const GridBatchImpl& fineBatchHdl, - const nanovdb::Coord branchingFactor) { - +nanovdb::GridHandle +buildCoarseGridFromFineGridCPU(const GridBatchImpl &fineBatchHdl, + const nanovdb::Coord branchingFactor) { using IndexTree = nanovdb::NanoTree; - const nanovdb::GridHandle& fineGridHdl = fineBatchHdl.nanoGridHandle(); + const nanovdb::GridHandle &fineGridHdl = fineBatchHdl.nanoGridHandle(); std::vector> batchHandles; batchHandles.reserve(fineGridHdl.gridCount()); for (uint32_t bidx = 0; bidx < fineGridHdl.gridCount(); bidx += 1) { - const nanovdb::NanoGrid* fineGrid = fineGridHdl.template grid(bidx); + const nanovdb::NanoGrid *fineGrid = fineGridHdl.template grid(bidx); if (!fineGrid) { throw std::runtime_error("Failed to get pointer to nanovdb index grid"); } - const IndexTree& fineTree = fineGrid->tree(); + const IndexTree &fineTree = fineGrid->tree(); - using ProxyGridT = nanovdb::tools::build::Grid; - auto proxyGrid = std::make_shared(-1.0f); + using ProxyGridT = nanovdb::tools::build::Grid; + auto proxyGrid = std::make_shared(-1.0f); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); for (auto it = ActiveVoxelIterator(fineTree); it.isValid(); it++) { - const nanovdb::Coord coarseIjk = (it->first.asVec3d() / branchingFactor.asVec3d()).floor(); + const nanovdb::Coord coarseIjk = + (it->first.asVec3d() / branchingFactor.asVec3d()).floor(); proxyGridAccessor.setValue(coarseIjk, 1.0f); } proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); ret.buffer().setDevice(torch::kCPU, true); batchHandles.push_back(std::move(ret)); } @@ -54,24 +55,23 @@ nanovdb::GridHandle buildCoarseGridFromFineGridCPU(const Grid } } - template -nanovdb::GridHandle buildConvGridFromGridCPU(const GridBatchImpl& baseBatchHdl, - const nanovdb::Coord& kernelSize, - const nanovdb::Coord& stride) { - +nanovdb::GridHandle +buildConvGridFromGridCPU(const GridBatchImpl &baseBatchHdl, const nanovdb::Coord &kernelSize, + const nanovdb::Coord &stride) { if (stride == nanovdb::Coord(1) || stride == kernelSize) { return buildCoarseGridFromFineGridCPU(baseBatchHdl, stride); } - const nanovdb::GridHandle& baseGridHdl = baseBatchHdl.nanoGridHandle(); + const nanovdb::GridHandle &baseGridHdl = baseBatchHdl.nanoGridHandle(); std::vector> batchHandles; 
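    // Worked example of the footprint computed below (illustrative numbers only): an odd
    // kernelSize such as (3, 3, 3) uses the symmetric window [-1, 1] on each axis, while an
    // even kernelSize such as (2, 2, 2) uses [0, 1]. With stride = (2, 2, 2), a base voxel
    // ijk0 = (4, 4, 4) emits candidates dstIjk = ijk0 + (dk, dj, di), and only candidates
    // whose coordinates are divisible by the stride are kept, e.g. dstIjk = (4, 4, 4) maps
    // to output voxel (4 / 2, 4 / 2, 4 / 2) = (2, 2, 2).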
batchHandles.reserve(baseGridHdl.gridCount()); int lower[3], upper[3]; for (int i = 0; i < 3; i += 1) { if (kernelSize[i] % 2 == 0) { - lower[i] = 0; upper[i] = kernelSize[i] - 1; + lower[i] = 0; + upper[i] = kernelSize[i] - 1; } else { lower[i] = -(kernelSize[i] - 1) / 2; upper[i] = (kernelSize[i] - 1) / 2; @@ -79,33 +79,37 @@ nanovdb::GridHandle buildConvGridFromGridCPU(const GridBatchI } for (uint32_t bidx = 0; bidx < baseGridHdl.gridCount(); bidx += 1) { - - const nanovdb::NanoGrid* baseGrid = baseGridHdl.template grid(bidx); + const nanovdb::NanoGrid *baseGrid = baseGridHdl.template grid(bidx); if (!baseGrid) { throw std::runtime_error("Failed to get pointer to nanovdb index grid"); } - using ProxyGridT = nanovdb::tools::build::Grid; - auto proxyGrid = std::make_shared(-1.0f); + using ProxyGridT = nanovdb::tools::build::Grid; + auto proxyGrid = std::make_shared(-1.0f); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); for (auto it = ActiveVoxelIterator(baseGrid->tree()); it.isValid(); it++) { - const nanovdb::Coord& ijk0 = it->first; + const nanovdb::Coord &ijk0 = it->first; for (int di = lower[0]; di <= upper[0]; di += 1) { for (int dj = lower[1]; dj <= upper[1]; dj += 1) { for (int dk = lower[2]; dk <= upper[2]; dk += 1) { const nanovdb::Coord dstIjk = ijk0 + nanovdb::Coord(dk, dj, di); - if (dstIjk[0] % stride[2] != 0 || dstIjk[1] % stride[1] != 0 || dstIjk[2] % stride[0] != 0) continue; - proxyGridAccessor.setValue(nanovdb::Coord( - dstIjk[0] / stride[2], dstIjk[1] / stride[1], dstIjk[2] / stride[0]), 1.0f); + if (dstIjk[0] % stride[2] != 0 || dstIjk[1] % stride[1] != 0 || + dstIjk[2] % stride[0] != 0) + continue; + proxyGridAccessor.setValue(nanovdb::Coord(dstIjk[0] / stride[2], + dstIjk[1] / stride[1], + dstIjk[2] / stride[0]), + 1.0f); } } } } proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); ret.buffer().setDevice(torch::kCPU, true); batchHandles.push_back(std::move(ret)); } @@ -117,19 +121,18 @@ nanovdb::GridHandle buildConvGridFromGridCPU(const GridBatchI } } - -nanovdb::GridHandle buildConvGridFromGrid(bool isMutable, - const GridBatchImpl& baseGridHdl, - const nanovdb::Coord& kernelSize, - const nanovdb::Coord& stride) { +nanovdb::GridHandle +buildConvGridFromGrid(bool isMutable, const GridBatchImpl &baseGridHdl, + const nanovdb::Coord &kernelSize, const nanovdb::Coord &stride) { /** * Logic for building the conv grid is the same as torchsparse 2.0.0b. - * However, torchsparse has a bug that creates excessive voxels in the void space, it is fixed in a customized - * branch - hence the additional URL for pre-built wheels. + * However, torchsparse has a bug that creates excessive voxels in the void space, it is fixed + * in a customized branch - hence the additional URL for pre-built wheels. 
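     *
     * Rough usage sketch (the grid name below is a placeholder, not part of this change):
     *
     *   // Output topology for a 3x3x3 sparse convolution with stride 2 over baseGrid:
     *   auto handle = buildConvGridFromGrid(false, baseGrid,
     *                                       nanovdb::Coord(3, 3, 3), nanovdb::Coord(2, 2, 2));
     *
     * When stride == 1 or stride == kernelSize this reduces to the coarse-grid build, as the
     * CPU implementation above does explicitly.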
*/ if (baseGridHdl.device().is_cuda()) { - JaggedTensor coords = ops::dispatchConvIJKForGrid(baseGridHdl, kernelSize, stride); + JaggedTensor coords = + ops::dispatchConvIJKForGrid(baseGridHdl, kernelSize, stride); return ops::dispatchCreateNanoGridFromIJK(coords, isMutable); } else { return FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { @@ -138,8 +141,6 @@ nanovdb::GridHandle buildConvGridFromGrid(bool isMutable, } } - - } // namespace build } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/build/DenseGrid.cpp b/fvdb/src/detail/build/DenseGrid.cpp index 2a37717928..5b3b080fc0 100644 --- a/fvdb/src/detail/build/DenseGrid.cpp +++ b/fvdb/src/detail/build/DenseGrid.cpp @@ -3,32 +3,28 @@ // #include "Build.h" +#include +#include + #include -#include #include - -#include "detail/utils/Utils.h" -#include "detail/ops/Ops.h" - +#include namespace fvdb { namespace detail { namespace build { - template -nanovdb::GridHandle buildDenseGridCPU(const uint32_t batchSize, - const nanovdb::Coord& size, - const nanovdb::Coord& ijkMin, - torch::optional mask) { - +nanovdb::GridHandle +buildDenseGridCPU(const uint32_t batchSize, const nanovdb::Coord &size, + const nanovdb::Coord &ijkMin, torch::optional mask) { torch::TensorAccessor maskAccessor(nullptr, nullptr, nullptr); if (mask.has_value()) { maskAccessor = mask.value().accessor(); } - using ProxyGridT = nanovdb::tools::build::Grid; - auto proxyGrid = std::make_shared(0.0f); + using ProxyGridT = nanovdb::tools::build::Grid; + auto proxyGrid = std::make_shared(0.0f); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); for (int32_t i = 0; i < size[0]; i += 1) { @@ -49,7 +45,9 @@ nanovdb::GridHandle buildDenseGridCPU(const uint32_t batchSiz } proxyGridAccessor.merge(); - nanovdb::GridHandle ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); + nanovdb::GridHandle ret = + nanovdb::tools::createNanoGrid(*proxyGrid, 0u, + false, false); ret.buffer().setDevice(torch::kCPU, true /* sync */); TorchDeviceBuffer guide(0, nullptr); @@ -69,28 +67,29 @@ nanovdb::GridHandle buildDenseGridCPU(const uint32_t batchSiz } } - - -nanovdb::GridHandle buildDenseGrid(torch::Device device, bool isMutable, - const uint32_t batchSize, - const nanovdb::Coord& size, - const nanovdb::Coord& ijkMin, - const torch::optional& mask) { - - TORCH_CHECK(size[0] > 0 && size[1] > 0 && size[2] > 0, "Size must be greater than 0 in all dimensions"); - TORCH_CHECK((__uint128_t) size[0] * size[1] * size[2] <= std::numeric_limits::max(), - "Size of dense grid exceeds the number of voxels supported by a GridBatch"); - TORCH_CHECK((__uint128_t) size[0] * size[1] * size[2] * batchSize <= std::numeric_limits::max(), - "Size and batch size exceed the number of voxels supported by a GridBatch"); +nanovdb::GridHandle +buildDenseGrid(torch::Device device, bool isMutable, const uint32_t batchSize, + const nanovdb::Coord &size, const nanovdb::Coord &ijkMin, + const torch::optional &mask) { + TORCH_CHECK(size[0] > 0 && size[1] > 0 && size[2] > 0, + "Size must be greater than 0 in all dimensions"); + TORCH_CHECK((__uint128_t)size[0] * size[1] * size[2] <= std::numeric_limits::max(), + "Size of dense grid exceeds the number of voxels supported by a GridBatch"); + TORCH_CHECK((__uint128_t)size[0] * size[1] * size[2] * batchSize <= + std::numeric_limits::max(), + "Size and batch size exceed the number of voxels supported by a GridBatch"); if (mask.has_value()) { - TORCH_CHECK(mask.value().device() == device, "Mask device must match device of dense grid to build"); + 
TORCH_CHECK(mask.value().device() == device, + "Mask device must match device of dense grid to build"); TORCH_CHECK(mask.value().dtype() == torch::kBool, "Mask must be of type bool"); TORCH_CHECK(mask.value().dim() == 3, "Mask must be 3D"); - TORCH_CHECK(mask.value().size(0) == size[0] && mask.value().size(1) == size[1] && mask.value().size(2) == size[2], + TORCH_CHECK(mask.value().size(0) == size[0] && mask.value().size(1) == size[1] && + mask.value().size(2) == size[2], "Mask must have same size as dense grid to build"); } if (device.is_cuda()) { - return ops::dispatchCreateNanoGridFromDense(batchSize, ijkMin, size, isMutable, device, mask); + return ops::dispatchCreateNanoGridFromDense(batchSize, ijkMin, size, + isMutable, device, mask); } else { return FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { return buildDenseGridCPU(batchSize, size, ijkMin, mask); @@ -98,7 +97,6 @@ nanovdb::GridHandle buildDenseGrid(torch::Device device, bool } } - } // namespace build } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/build/EmptyGrid.cpp b/fvdb/src/detail/build/EmptyGrid.cpp index b490125b84..eb07a527d5 100644 --- a/fvdb/src/detail/build/EmptyGrid.cpp +++ b/fvdb/src/detail/build/EmptyGrid.cpp @@ -3,32 +3,31 @@ // #include "Build.h" -#include "detail/utils/Utils.h" +#include #include -#include #include - +#include namespace fvdb { namespace detail { namespace build { - -nanovdb::GridHandle buildEmptyGrid(torch::Device device, bool isMutable) { +nanovdb::GridHandle +buildEmptyGrid(torch::Device device, bool isMutable) { return FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { - using ProxyGridT = nanovdb::tools::build::Grid; - auto proxyGrid = std::make_shared(0.0f); + using ProxyGridT = nanovdb::tools::build::Grid; + auto proxyGrid = std::make_shared(0.0f); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); ret.buffer().setDevice(device, true /* sync */); return ret; }); } - } // namespace build } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/build/FineFromCoarse.cpp b/fvdb/src/detail/build/FineFromCoarse.cpp index efa6c41ff4..7deede22ed 100644 --- a/fvdb/src/detail/build/FineFromCoarse.cpp +++ b/fvdb/src/detail/build/FineFromCoarse.cpp @@ -3,40 +3,37 @@ // #include "Build.h" +#include +#include + #include -#include #include - -#include "detail/utils/Utils.h" -#include "detail/ops/Ops.h" - +#include namespace fvdb { namespace detail { namespace build { - template -nanovdb::GridHandle buildFineGridFromCoarseGridCPU(const GridBatchImpl& coarseBatchHdl, - const torch::Tensor& subdivMask, - const nanovdb::Coord subdivisionFactor) { - +nanovdb::GridHandle +buildFineGridFromCoarseGridCPU(const GridBatchImpl &coarseBatchHdl, const torch::Tensor &subdivMask, + const nanovdb::Coord subdivisionFactor) { using IndexTree = nanovdb::NanoTree; - const nanovdb::GridHandle& coarseGridHdl = coarseBatchHdl.nanoGridHandle(); - const torch::TensorAccessor& subdivMaskAcc = subdivMask.accessor(); + const nanovdb::GridHandle &coarseGridHdl = coarseBatchHdl.nanoGridHandle(); + const torch::TensorAccessor &subdivMaskAcc = subdivMask.accessor(); std::vector> batchHandles; batchHandles.reserve(coarseGridHdl.gridCount()); for (uint32_t bidx = 0; bidx < coarseGridHdl.gridCount(); bidx += 1) { - const nanovdb::NanoGrid* coarseGrid = coarseGridHdl.template grid(bidx); + const nanovdb::NanoGrid 
*coarseGrid = coarseGridHdl.template grid(bidx); if (!coarseGrid) { throw std::runtime_error("Failed to get pointer to nanovdb index grid"); } - const IndexTree& coarseTree = coarseGrid->tree(); + const IndexTree &coarseTree = coarseGrid->tree(); - using ProxyGridT = nanovdb::tools::build::Grid; - auto proxyGrid = std::make_shared(-1.0f); + using ProxyGridT = nanovdb::tools::build::Grid; + auto proxyGrid = std::make_shared(-1.0f); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); for (auto it = ActiveVoxelIterator(coarseTree); it.isValid(); it++) { @@ -59,7 +56,8 @@ nanovdb::GridHandle buildFineGridFromCoarseGridCPU(const Grid } proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); ret.buffer().setDevice(torch::kCPU, true); batchHandles.push_back(std::move(ret)); } @@ -71,14 +69,13 @@ nanovdb::GridHandle buildFineGridFromCoarseGridCPU(const Grid } } - -nanovdb::GridHandle buildFineGridFromCoarseGrid(bool isMutable, - const GridBatchImpl& coarseBatchHdl, - const torch::optional& subdivMask, - const nanovdb::Coord subdivisionFactor) { - +nanovdb::GridHandle +buildFineGridFromCoarseGrid(bool isMutable, const GridBatchImpl &coarseBatchHdl, + const torch::optional &subdivMask, + const nanovdb::Coord subdivisionFactor) { if (coarseBatchHdl.device().is_cuda()) { - JaggedTensor coords = ops::dispatchFineIJKForCoarseGrid(coarseBatchHdl, subdivisionFactor, subdivMask); + JaggedTensor coords = ops::dispatchFineIJKForCoarseGrid( + coarseBatchHdl, subdivisionFactor, subdivMask); return ops::dispatchCreateNanoGridFromIJK(coords, isMutable); } else { torch::Tensor subdivMaskTensor; @@ -88,12 +85,12 @@ nanovdb::GridHandle buildFineGridFromCoarseGrid(bool isMutabl subdivMaskTensor = torch::zeros(0, torch::TensorOptions().dtype(torch::kBool)); } return FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { - return buildFineGridFromCoarseGridCPU(coarseBatchHdl, subdivMaskTensor, subdivisionFactor); + return buildFineGridFromCoarseGridCPU(coarseBatchHdl, subdivMaskTensor, + subdivisionFactor); }); } } - } // namespace build } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/build/FromMesh.cpp b/fvdb/src/detail/build/FromMesh.cpp index cda2254ced..b0025b2419 100644 --- a/fvdb/src/detail/build/FromMesh.cpp +++ b/fvdb/src/detail/build/FromMesh.cpp @@ -3,54 +3,54 @@ // #include "Build.h" +#include +#include + #include -#include #include - -#include "detail/utils/Utils.h" -#include "detail/ops/Ops.h" - +#include namespace fvdb { namespace detail { namespace build { - template -nanovdb::GridHandle buildGridFromMeshCPU(const JaggedTensor& vertices, - const JaggedTensor& triangles, - const std::vector& tx) { - - using Vec3T = nanovdb::math::Vec3; +nanovdb::GridHandle +buildGridFromMeshCPU(const JaggedTensor &vertices, const JaggedTensor &triangles, + const std::vector &tx) { + using Vec3T = nanovdb::math::Vec3; using ProxyGridT = nanovdb::tools::build::Grid; - std::vector> batchHandles; batchHandles.reserve(vertices.num_outer_lists()); for (int64_t bidx = 0; bidx < vertices.num_outer_lists(); bidx += 1) { + const torch::Tensor ti = triangles.index({ bidx }).jdata(); + const torch::Tensor vi = vertices.index({ bidx }).jdata(); + const VoxelCoordTransform &txi = tx[bidx]; - const torch::Tensor ti = triangles.index({bidx}).jdata(); - const torch::Tensor vi = vertices.index({bidx}).jdata(); - const VoxelCoordTransform& txi = tx[bidx]; - - auto proxyGrid = 
std::make_shared(-1.0f); + auto proxyGrid = std::make_shared(-1.0f); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); // int64_t numSearched = 0; // int64_t numFound = 0; // For eacjh face, compute thee min max voxels for (int faceId = 0; faceId < ti.size(0); faceId += 1) { - const torch::Tensor face = ti.index({faceId}); // 3 - const torch::Tensor faceVertices = vi.index({face}); // [3, 3] - torch::TensorAccessor faceVerticesAcc = faceVertices.accessor(); - const Vec3T v1 = txi.apply(Vec3T(faceVerticesAcc[0][0], faceVerticesAcc[0][1], faceVerticesAcc[0][2])); - const Vec3T v2 = txi.apply(Vec3T(faceVerticesAcc[1][0], faceVerticesAcc[1][1], faceVerticesAcc[1][2])); - const Vec3T v3 = txi.apply(Vec3T(faceVerticesAcc[2][0], faceVerticesAcc[2][1], faceVerticesAcc[2][2])); - - const Vec3T e1 = v2 - v1; - const Vec3T e2 = v3 - v1; - const ScalarType spacing = sqrt(3.0) / 3.0; // This is very conservative spacing but fine for now + const torch::Tensor face = ti.index({ faceId }); // 3 + const torch::Tensor faceVertices = vi.index({ face }); // [3, 3] + torch::TensorAccessor faceVerticesAcc = + faceVertices.accessor(); + const Vec3T v1 = txi.apply( + Vec3T(faceVerticesAcc[0][0], faceVerticesAcc[0][1], faceVerticesAcc[0][2])); + const Vec3T v2 = txi.apply( + Vec3T(faceVerticesAcc[1][0], faceVerticesAcc[1][1], faceVerticesAcc[1][2])); + const Vec3T v3 = txi.apply( + Vec3T(faceVerticesAcc[2][0], faceVerticesAcc[2][1], faceVerticesAcc[2][2])); + + const Vec3T e1 = v2 - v1; + const Vec3T e2 = v3 - v1; + const ScalarType spacing = + sqrt(3.0) / 3.0; // This is very conservative spacing but fine for now const int32_t numU = ceil((e1.length() + spacing) / spacing); const int32_t numV = ceil((e2.length() + spacing) / spacing); @@ -63,7 +63,7 @@ nanovdb::GridHandle buildGridFromMeshCPU(const JaggedTensor& u = 1.0 - u; v = 1.0 - v; } - const Vec3T p = v1 + e1 * u + e2 * v; + const Vec3T p = v1 + e1 * u + e2 * v; const nanovdb::Coord ijk = p.round(); proxyGridAccessor.setValue(ijk, 1.0f); @@ -75,7 +75,8 @@ nanovdb::GridHandle buildGridFromMeshCPU(const JaggedTensor& // std::cerr << "I searched over " << numSearched << " voxels" << std::endl; // std::cerr << "I found " << numFound << " voxels" << std::endl; proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); ret.buffer().setDevice(torch::kCPU, true); batchHandles.push_back(std::move(ret)); } @@ -87,24 +88,22 @@ nanovdb::GridHandle buildGridFromMeshCPU(const JaggedTensor& } } - -nanovdb::GridHandle buildGridFromMesh(bool isMutable, - const JaggedTensor meshVertices, - const JaggedTensor meshFaces, - const std::vector& tx) { +nanovdb::GridHandle +buildGridFromMesh(bool isMutable, const JaggedTensor meshVertices, const JaggedTensor meshFaces, + const std::vector &tx) { if (meshVertices.device().is_cuda()) { JaggedTensor coords = ops::dispatchIJKForMesh(meshVertices, meshFaces, tx); return ops::dispatchCreateNanoGridFromIJK(coords, isMutable); } else { return FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { - return AT_DISPATCH_FLOATING_TYPES(meshVertices.scalar_type(), "buildGridFromMeshCPU", [&]() { - return buildGridFromMeshCPU(meshVertices, meshFaces, tx); - }); + return AT_DISPATCH_FLOATING_TYPES( + meshVertices.scalar_type(), "buildGridFromMeshCPU", [&]() { + return buildGridFromMeshCPU(meshVertices, meshFaces, tx); + }); }); } } - } // namespace build } // namespace detail } // namespace fvdb diff --git 
a/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp b/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp index d763990513..805aa358a7 100644 --- a/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp +++ b/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp @@ -3,89 +3,90 @@ // #include "Build.h" +#include +#include + #include -#include #include - -#include "detail/utils/Utils.h" -#include "detail/ops/Ops.h" - +#include namespace fvdb { namespace detail { namespace build { - template -nanovdb::GridHandle buildNearestNeighborGridFromPointsCPU(const JaggedTensor& jaggedPoints, - const std::vector& txs) { - - return AT_DISPATCH_FLOATING_TYPES_AND_HALF(jaggedPoints.scalar_type(), "buildNearestNeighborGridFromPoints", [&]() { - using ScalarT = scalar_t; - using MathT = typename at::opmath_type; - using Vec3T = typename nanovdb::math::Vec3; - using ProxyGridT = nanovdb::tools::build::Grid; - - static_assert(is_floating_point_or_half::value, "Invalid type for points, must be floating point"); - - jaggedPoints.check_valid(); - - const torch::TensorAccessor& pointsAcc = jaggedPoints.jdata().accessor(); - const torch::TensorAccessor& pointsBOffsetsAcc = jaggedPoints.joffsets().accessor(); - - std::vector> batchHandles; - batchHandles.reserve(pointsBOffsetsAcc.size(0) - 1); - for (int bi = 0; bi < (pointsBOffsetsAcc.size(0) - 1); bi += 1) { - - const VoxelCoordTransform& tx = txs[bi]; - - auto proxyGrid = std::make_shared(-1.0f); - auto proxyGridAccessor = proxyGrid->getWriteAccessor(); - - const int64_t start = pointsBOffsetsAcc[bi]; - const int64_t end = pointsBOffsetsAcc[bi+1]; - - for (int64_t pi = start; pi < end; pi += 1) { - Vec3T ijk0 = tx.apply(static_cast(pointsAcc[pi][0]), - static_cast(pointsAcc[pi][1]), - static_cast(pointsAcc[pi][2])); - nanovdb::Coord ijk000 = ijk0.floor(); - nanovdb::Coord ijk001 = ijk000 + nanovdb::Coord(0, 0, 1); - nanovdb::Coord ijk010 = ijk000 + nanovdb::Coord(0, 1, 0); - nanovdb::Coord ijk011 = ijk000 + nanovdb::Coord(0, 1, 1); - nanovdb::Coord ijk100 = ijk000 + nanovdb::Coord(1, 0, 0); - nanovdb::Coord ijk101 = ijk000 + nanovdb::Coord(1, 0, 1); - nanovdb::Coord ijk110 = ijk000 + nanovdb::Coord(1, 1, 0); - nanovdb::Coord ijk111 = ijk000 + nanovdb::Coord(1, 1, 1); - - proxyGridAccessor.setValue(ijk000, 11.0f); - proxyGridAccessor.setValue(ijk001, 11.0f); - proxyGridAccessor.setValue(ijk010, 11.0f); - proxyGridAccessor.setValue(ijk011, 11.0f); - proxyGridAccessor.setValue(ijk100, 11.0f); - proxyGridAccessor.setValue(ijk101, 11.0f); - proxyGridAccessor.setValue(ijk110, 11.0f); - proxyGridAccessor.setValue(ijk111, 11.0f); +nanovdb::GridHandle +buildNearestNeighborGridFromPointsCPU(const JaggedTensor &jaggedPoints, + const std::vector &txs) { + return AT_DISPATCH_FLOATING_TYPES_AND_HALF( + jaggedPoints.scalar_type(), "buildNearestNeighborGridFromPoints", [&]() { + using ScalarT = scalar_t; + using MathT = typename at::opmath_type; + using Vec3T = typename nanovdb::math::Vec3; + using ProxyGridT = nanovdb::tools::build::Grid; + + static_assert(is_floating_point_or_half::value, + "Invalid type for points, must be floating point"); + + jaggedPoints.check_valid(); + + const torch::TensorAccessor &pointsAcc = + jaggedPoints.jdata().accessor(); + const torch::TensorAccessor &pointsBOffsetsAcc = + jaggedPoints.joffsets().accessor(); + + std::vector> batchHandles; + batchHandles.reserve(pointsBOffsetsAcc.size(0) - 1); + for (int bi = 0; bi < (pointsBOffsetsAcc.size(0) - 1); bi += 1) { + const VoxelCoordTransform &tx = txs[bi]; + + auto proxyGrid = 
std::make_shared(-1.0f); + auto proxyGridAccessor = proxyGrid->getWriteAccessor(); + + const int64_t start = pointsBOffsetsAcc[bi]; + const int64_t end = pointsBOffsetsAcc[bi + 1]; + + for (int64_t pi = start; pi < end; pi += 1) { + Vec3T ijk0 = tx.apply(static_cast(pointsAcc[pi][0]), + static_cast(pointsAcc[pi][1]), + static_cast(pointsAcc[pi][2])); + nanovdb::Coord ijk000 = ijk0.floor(); + nanovdb::Coord ijk001 = ijk000 + nanovdb::Coord(0, 0, 1); + nanovdb::Coord ijk010 = ijk000 + nanovdb::Coord(0, 1, 0); + nanovdb::Coord ijk011 = ijk000 + nanovdb::Coord(0, 1, 1); + nanovdb::Coord ijk100 = ijk000 + nanovdb::Coord(1, 0, 0); + nanovdb::Coord ijk101 = ijk000 + nanovdb::Coord(1, 0, 1); + nanovdb::Coord ijk110 = ijk000 + nanovdb::Coord(1, 1, 0); + nanovdb::Coord ijk111 = ijk000 + nanovdb::Coord(1, 1, 1); + + proxyGridAccessor.setValue(ijk000, 11.0f); + proxyGridAccessor.setValue(ijk001, 11.0f); + proxyGridAccessor.setValue(ijk010, 11.0f); + proxyGridAccessor.setValue(ijk011, 11.0f); + proxyGridAccessor.setValue(ijk100, 11.0f); + proxyGridAccessor.setValue(ijk101, 11.0f); + proxyGridAccessor.setValue(ijk110, 11.0f); + proxyGridAccessor.setValue(ijk111, 11.0f); + } + + proxyGridAccessor.merge(); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); + ret.buffer().setDevice(torch::kCPU, true); + batchHandles.push_back(std::move(ret)); } - proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); - ret.buffer().setDevice(torch::kCPU, true); - batchHandles.push_back(std::move(ret)); - } - - if (batchHandles.size() == 1) { - return std::move(batchHandles[0]); - } else { - return nanovdb::mergeGrids(batchHandles); - } - }); + if (batchHandles.size() == 1) { + return std::move(batchHandles[0]); + } else { + return nanovdb::mergeGrids(batchHandles); + } + }); } - -nanovdb::GridHandle buildNearestNeighborGridFromPoints(bool isMutable, - const JaggedTensor& points, - const std::vector& txs) { +nanovdb::GridHandle +buildNearestNeighborGridFromPoints(bool isMutable, const JaggedTensor &points, + const std::vector &txs) { if (points.device().is_cuda()) { JaggedTensor coords = ops::dispatchNearestNeighborIJKForPoints(points, txs); return ops::dispatchCreateNanoGridFromIJK(coords, isMutable); @@ -96,8 +97,6 @@ nanovdb::GridHandle buildNearestNeighborGridFromPoints(bool i } } - - } // namespace build } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/build/PaddedGridFromCoords.cpp b/fvdb/src/detail/build/PaddedGridFromCoords.cpp index c45554d361..275c393b0d 100644 --- a/fvdb/src/detail/build/PaddedGridFromCoords.cpp +++ b/fvdb/src/detail/build/PaddedGridFromCoords.cpp @@ -3,82 +3,82 @@ // #include "Build.h" -#include +#include +#include #include -#include #include +#include -#include "detail/utils/Utils.h" -#include "detail/ops/Ops.h" - +#include namespace fvdb { namespace detail { namespace build { - template -nanovdb::GridHandle buildPaddedGridFromCoordsCPU(const JaggedTensor& jaggedCoords, - const nanovdb::Coord& bmin, - const nanovdb::Coord& bmax) { - - return AT_DISPATCH_INTEGRAL_TYPES(jaggedCoords.scalar_type(), "buildPaddedGridFromCoords", [&]() { - using ScalarT = scalar_t; - jaggedCoords.check_valid(); - - static_assert(std::is_integral::value, "Invalid type for coords, must be integral"); - - using ProxyGridT = nanovdb::tools::build::Grid; - - const torch::TensorAccessor& coordsAcc = jaggedCoords.jdata().accessor(); - const torch::TensorAccessor& coordsBOffsetsAcc = jaggedCoords.joffsets().accessor(); - - 
std::vector> batchHandles; - batchHandles.reserve(coordsBOffsetsAcc.size(0) - 1); - for (int bi = 0; bi < (coordsBOffsetsAcc.size(0) - 1); bi += 1) { - - auto proxyGrid = std::make_shared(-1.0f); - auto proxyGridAccessor = proxyGrid->getWriteAccessor(); - - const int64_t start = coordsBOffsetsAcc[bi]; - const int64_t end = coordsBOffsetsAcc[bi+1]; - - for (unsigned ci = start; ci < end; ci += 1) { - nanovdb::Coord ijk0(coordsAcc[ci][0], coordsAcc[ci][1], coordsAcc[ci][2]); - - // Splat the normal to the 8 neighboring voxels - for (int di = bmin[0]; di <= bmax[0]; di += 1) { - for (int dj = bmin[1]; dj <= bmax[1]; dj += 1) { - for (int dk = bmin[2]; dk <= bmax[2]; dk += 1) { - const nanovdb::Coord ijk = ijk0 + nanovdb::Coord(di, dj, dk); - proxyGridAccessor.setValue(ijk, 11); +nanovdb::GridHandle +buildPaddedGridFromCoordsCPU(const JaggedTensor &jaggedCoords, const nanovdb::Coord &bmin, + const nanovdb::Coord &bmax) { + return AT_DISPATCH_INTEGRAL_TYPES( + jaggedCoords.scalar_type(), "buildPaddedGridFromCoords", [&]() { + using ScalarT = scalar_t; + jaggedCoords.check_valid(); + + static_assert(std::is_integral::value, + "Invalid type for coords, must be integral"); + + using ProxyGridT = nanovdb::tools::build::Grid; + + const torch::TensorAccessor &coordsAcc = + jaggedCoords.jdata().accessor(); + const torch::TensorAccessor &coordsBOffsetsAcc = + jaggedCoords.joffsets().accessor(); + + std::vector> batchHandles; + batchHandles.reserve(coordsBOffsetsAcc.size(0) - 1); + for (int bi = 0; bi < (coordsBOffsetsAcc.size(0) - 1); bi += 1) { + auto proxyGrid = std::make_shared(-1.0f); + auto proxyGridAccessor = proxyGrid->getWriteAccessor(); + + const int64_t start = coordsBOffsetsAcc[bi]; + const int64_t end = coordsBOffsetsAcc[bi + 1]; + + for (unsigned ci = start; ci < end; ci += 1) { + nanovdb::Coord ijk0(coordsAcc[ci][0], coordsAcc[ci][1], coordsAcc[ci][2]); + + // Splat the normal to the 8 neighboring voxels + for (int di = bmin[0]; di <= bmax[0]; di += 1) { + for (int dj = bmin[1]; dj <= bmax[1]; dj += 1) { + for (int dk = bmin[2]; dk <= bmax[2]; dk += 1) { + const nanovdb::Coord ijk = ijk0 + nanovdb::Coord(di, dj, dk); + proxyGridAccessor.setValue(ijk, 11); + } } } } - } - proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); - ret.buffer().setDevice(torch::kCPU, true); - batchHandles.push_back(std::move(ret)); - } + proxyGridAccessor.merge(); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); + ret.buffer().setDevice(torch::kCPU, true); + batchHandles.push_back(std::move(ret)); + } - if (batchHandles.size() == 1) { - return std::move(batchHandles[0]); - } else { - return nanovdb::mergeGrids(batchHandles); - } - }); + if (batchHandles.size() == 1) { + return std::move(batchHandles[0]); + } else { + return nanovdb::mergeGrids(batchHandles); + } + }); } - -nanovdb::GridHandle buildPaddedGridFromCoords(bool isMutable, - const JaggedTensor& coords, - const nanovdb::Coord& bmin, - const nanovdb::Coord& bmax) { +nanovdb::GridHandle +buildPaddedGridFromCoords(bool isMutable, const JaggedTensor &coords, const nanovdb::Coord &bmin, + const nanovdb::Coord &bmax) { if (coords.device().is_cuda()) { - JaggedTensor buildCoords = ops::dispatchPaddedIJKForCoords(coords, bmin, bmax); + JaggedTensor buildCoords = + ops::dispatchPaddedIJKForCoords(coords, bmin, bmax); return ops::dispatchCreateNanoGridFromIJK(buildCoords, isMutable); } else { return FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { diff --git 
a/fvdb/src/detail/build/PaddedGridFromGrid.cpp b/fvdb/src/detail/build/PaddedGridFromGrid.cpp index 3108545806..4f78ee99f7 100644 --- a/fvdb/src/detail/build/PaddedGridFromGrid.cpp +++ b/fvdb/src/detail/build/PaddedGridFromGrid.cpp @@ -3,48 +3,47 @@ // #include "Build.h" +#include +#include + #include -#include #include - -#include "detail/utils/Utils.h" -#include "detail/ops/Ops.h" - +#include namespace fvdb { namespace detail { namespace build { - template -nanovdb::GridHandle buildPaddedGridFromGridWithoutBorderCPU(const GridBatchImpl& baseBatchHdl, int BMIN, int BMAX) { +nanovdb::GridHandle +buildPaddedGridFromGridWithoutBorderCPU(const GridBatchImpl &baseBatchHdl, int BMIN, int BMAX) { TORCH_CHECK(BMIN <= BMAX, "BMIN must be less than BMAX"); - const nanovdb::GridHandle& baseGridHdl = baseBatchHdl.nanoGridHandle(); + const nanovdb::GridHandle &baseGridHdl = baseBatchHdl.nanoGridHandle(); std::vector> batchHandles; batchHandles.reserve(baseGridHdl.gridCount()); for (uint32_t bidx = 0; bidx < baseGridHdl.gridCount(); bidx += 1) { - - const nanovdb::NanoGrid* baseGrid = baseGridHdl.template grid(bidx); + const nanovdb::NanoGrid *baseGrid = baseGridHdl.template grid(bidx); if (!baseGrid) { throw std::runtime_error("Failed to get pointer to nanovdb index grid"); } auto baseGridAccessor = baseGrid->getAccessor(); - using ProxyGridT = nanovdb::tools::build::Grid; - auto proxyGrid = std::make_shared(-1.0f); + using ProxyGridT = nanovdb::tools::build::Grid; + auto proxyGrid = std::make_shared(-1.0f); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); for (auto it = ActiveVoxelIterator(baseGrid->tree()); it.isValid(); it++) { - nanovdb::Coord ijk0 = it->first; - bool active = true; + nanovdb::Coord ijk0 = it->first; + bool active = true; for (int di = BMIN; di <= BMAX && active; di += 1) { for (int dj = BMIN; dj <= BMAX && active; dj += 1) { for (int dk = BMIN; dk <= BMAX && active; dk += 1) { const nanovdb::Coord ijk = ijk0 + nanovdb::Coord(di, dj, dk); if (ijk != ijk0) { - active = active && baseGridAccessor.isActive(ijk); // if any surrounding is off, turn it off. + active = active && baseGridAccessor.isActive( + ijk); // if any surrounding is off, turn it off. 
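                            // Net effect: ijk0 is kept only when every voxel of its
                            // [BMIN, BMAX]^3 neighborhood is active in the base grid, i.e. the
                            // active set is eroded, which is the "exclude border" behavior.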
} } } @@ -55,7 +54,8 @@ nanovdb::GridHandle buildPaddedGridFromGridWithoutBorderCPU(c } proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); ret.buffer().setDevice(torch::kCPU, true); batchHandles.push_back(std::move(ret)); } @@ -67,25 +67,23 @@ nanovdb::GridHandle buildPaddedGridFromGridWithoutBorderCPU(c } } - - template -nanovdb::GridHandle buildPaddedGridFromGridCPU(const GridBatchImpl& baseBatchHdl, int BMIN, int BMAX) { +nanovdb::GridHandle +buildPaddedGridFromGridCPU(const GridBatchImpl &baseBatchHdl, int BMIN, int BMAX) { TORCH_CHECK(BMIN <= BMAX, "BMIN must be less than BMAX"); - const nanovdb::GridHandle& baseGridHdl = baseBatchHdl.nanoGridHandle(); + const nanovdb::GridHandle &baseGridHdl = baseBatchHdl.nanoGridHandle(); std::vector> batchHandles; batchHandles.reserve(baseGridHdl.gridCount()); for (uint32_t bidx = 0; bidx < baseGridHdl.gridCount(); bidx += 1) { - - const nanovdb::NanoGrid* baseGrid = baseGridHdl.template grid(bidx); + const nanovdb::NanoGrid *baseGrid = baseGridHdl.template grid(bidx); if (!baseGrid) { throw std::runtime_error("Failed to get pointer to nanovdb index grid"); } - using ProxyGridT = nanovdb::tools::build::Grid; - auto proxyGrid = std::make_shared(-1.0f); + using ProxyGridT = nanovdb::tools::build::Grid; + auto proxyGrid = std::make_shared(-1.0f); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); for (auto it = ActiveVoxelIterator(baseGrid->tree()); it.isValid(); it++) { @@ -101,7 +99,8 @@ nanovdb::GridHandle buildPaddedGridFromGridCPU(const GridBatc } proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); ret.buffer().setDevice(torch::kCPU, true); batchHandles.push_back(std::move(ret)); } @@ -113,16 +112,17 @@ nanovdb::GridHandle buildPaddedGridFromGridCPU(const GridBatc } } - -nanovdb::GridHandle buildPaddedGridFromGrid(bool isMutable, - const GridBatchImpl& baseBatchHdl, - int bmin, int bmax, bool excludeBorder) { +nanovdb::GridHandle +buildPaddedGridFromGrid(bool isMutable, const GridBatchImpl &baseBatchHdl, int bmin, int bmax, + bool excludeBorder) { if (baseBatchHdl.device().is_cuda()) { JaggedTensor coords; if (excludeBorder) { - coords = ops::dispatchPaddedIJKForGridWithoutBorder(baseBatchHdl, nanovdb::Coord(bmin), nanovdb::Coord(bmax)); + coords = ops::dispatchPaddedIJKForGridWithoutBorder( + baseBatchHdl, nanovdb::Coord(bmin), nanovdb::Coord(bmax)); } else { - coords = ops::dispatchPaddedIJKForGrid(baseBatchHdl, nanovdb::Coord(bmin), nanovdb::Coord(bmax)); + coords = ops::dispatchPaddedIJKForGrid(baseBatchHdl, nanovdb::Coord(bmin), + nanovdb::Coord(bmax)); } return ops::dispatchCreateNanoGridFromIJK(coords, isMutable); } else { @@ -136,9 +136,6 @@ nanovdb::GridHandle buildPaddedGridFromGrid(bool isMutable, } } - - - } // namespace build } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/build/PaddedGridFromPoints.cpp b/fvdb/src/detail/build/PaddedGridFromPoints.cpp index 55a0871686..2c4219f185 100644 --- a/fvdb/src/detail/build/PaddedGridFromPoints.cpp +++ b/fvdb/src/detail/build/PaddedGridFromPoints.cpp @@ -3,97 +3,96 @@ // #include "Build.h" +#include +#include + #include -#include #include - -#include "detail/utils/Utils.h" -#include "detail/ops/Ops.h" - +#include namespace fvdb { namespace detail { namespace build { - template -nanovdb::GridHandle 
buildPaddedGridFromPointsCPU(const JaggedTensor& pointsJagged, - const std::vector& txs, - const nanovdb::Coord& bmin, - const nanovdb::Coord& bmax) { - return AT_DISPATCH_FLOATING_TYPES_AND_HALF(pointsJagged.scalar_type(), "buildPaddedGridFromPoints", [&](){ - using ScalarT = scalar_t; - static_assert(is_floating_point_or_half::value, "Invalid type for points, must be floating point"); - using MathT = typename at::opmath_type; - using ProxyGridT = nanovdb::tools::build::Grid; - - pointsJagged.check_valid(); - - const torch::TensorAccessor& pointsAcc = pointsJagged.jdata().accessor(); - const torch::TensorAccessor& pointsBOffsetsAcc = pointsJagged.joffsets().accessor(); - - std::vector> batchHandles; - batchHandles.reserve(pointsBOffsetsAcc.size(0) - 1); - for (int bi = 0; bi < (pointsBOffsetsAcc.size(0) - 1); bi += 1) { - VoxelCoordTransform tx = txs[bi]; - - auto proxyGrid = std::make_shared(-1.0f); - auto proxyGridAccessor = proxyGrid->getWriteAccessor(); - - const int64_t start = pointsBOffsetsAcc[bi]; - const int64_t end = pointsBOffsetsAcc[bi+1]; - - for (int64_t pi = start; pi < end; pi += 1) { - - nanovdb::Coord ijk0 = tx.apply(static_cast(pointsAcc[pi][0]), - static_cast(pointsAcc[pi][1]), - static_cast(pointsAcc[pi][2])).round(); - - // Splat the normal to the 8 neighboring voxels - for (int di = bmin[0]; di <= bmax[0]; di += 1) { - for (int dj = bmin[1]; dj <= bmax[1]; dj += 1) { - for (int dk = bmin[2]; dk <= bmax[2]; dk += 1) { - const nanovdb::Coord ijk = ijk0 + nanovdb::Coord(di, dj, dk); - proxyGridAccessor.setValue(ijk, 1.0f); +nanovdb::GridHandle +buildPaddedGridFromPointsCPU(const JaggedTensor &pointsJagged, + const std::vector &txs, + const nanovdb::Coord &bmin, const nanovdb::Coord &bmax) { + return AT_DISPATCH_FLOATING_TYPES_AND_HALF( + pointsJagged.scalar_type(), "buildPaddedGridFromPoints", [&]() { + using ScalarT = scalar_t; + static_assert(is_floating_point_or_half::value, + "Invalid type for points, must be floating point"); + using MathT = typename at::opmath_type; + using ProxyGridT = nanovdb::tools::build::Grid; + + pointsJagged.check_valid(); + + const torch::TensorAccessor &pointsAcc = + pointsJagged.jdata().accessor(); + const torch::TensorAccessor &pointsBOffsetsAcc = + pointsJagged.joffsets().accessor(); + + std::vector> batchHandles; + batchHandles.reserve(pointsBOffsetsAcc.size(0) - 1); + for (int bi = 0; bi < (pointsBOffsetsAcc.size(0) - 1); bi += 1) { + VoxelCoordTransform tx = txs[bi]; + + auto proxyGrid = std::make_shared(-1.0f); + auto proxyGridAccessor = proxyGrid->getWriteAccessor(); + + const int64_t start = pointsBOffsetsAcc[bi]; + const int64_t end = pointsBOffsetsAcc[bi + 1]; + + for (int64_t pi = start; pi < end; pi += 1) { + nanovdb::Coord ijk0 = tx.apply(static_cast(pointsAcc[pi][0]), + static_cast(pointsAcc[pi][1]), + static_cast(pointsAcc[pi][2])) + .round(); + + // Splat the normal to the 8 neighboring voxels + for (int di = bmin[0]; di <= bmax[0]; di += 1) { + for (int dj = bmin[1]; dj <= bmax[1]; dj += 1) { + for (int dk = bmin[2]; dk <= bmax[2]; dk += 1) { + const nanovdb::Coord ijk = ijk0 + nanovdb::Coord(di, dj, dk); + proxyGridAccessor.setValue(ijk, 1.0f); + } } } } + + proxyGridAccessor.merge(); + auto ret = nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); + ret.buffer().setDevice(torch::kCPU, true); + batchHandles.push_back(std::move(ret)); } - proxyGridAccessor.merge(); - auto ret = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); - ret.buffer().setDevice(torch::kCPU, true); - 
batchHandles.push_back(std::move(ret)); - } - - if (batchHandles.size() == 1) { - return std::move(batchHandles[0]); - } else { - return nanovdb::mergeGrids(batchHandles); - } - }); + if (batchHandles.size() == 1) { + return std::move(batchHandles[0]); + } else { + return nanovdb::mergeGrids(batchHandles); + } + }); } - -nanovdb::GridHandle buildPaddedGridFromPoints(bool isMutable, - const JaggedTensor& points, - const std::vector& txs, - const nanovdb::Coord& bmin, - const nanovdb::Coord& bmax) { +nanovdb::GridHandle +buildPaddedGridFromPoints(bool isMutable, const JaggedTensor &points, + const std::vector &txs, const nanovdb::Coord &bmin, + const nanovdb::Coord &bmax) { if (points.device().is_cuda()) { - JaggedTensor coords = ops::dispatchPaddedIJKForPoints(points, bmin, bmax, txs); + JaggedTensor coords = + ops::dispatchPaddedIJKForPoints(points, bmin, bmax, txs); return ops::dispatchCreateNanoGridFromIJK(coords, isMutable); } else { - return FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { return buildPaddedGridFromPointsCPU(points, txs, bmin, bmax); }); } } - - } // namespace build } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/io/IO.h b/fvdb/src/detail/io/IO.h index da2fd5fedc..64efa98854 100644 --- a/fvdb/src/detail/io/IO.h +++ b/fvdb/src/detail/io/IO.h @@ -1,42 +1,46 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#include -#include +#ifndef FVDB_DETAIL_IO_IO_H +#define FVDB_DETAIL_IO_IO_H -#include "GridBatch.h" +#include +#include -#include "Types.h" +#include +#include namespace fvdb { namespace detail { namespace io { std::tuple> -fromNVDB(nanovdb::GridHandle& handle, - const torch::optional maybeDevice = torch::optional()); +fromNVDB(nanovdb::GridHandle &handle, + const torch::optional maybeDevice = + torch::optional()); std::tuple> -fromNVDB(const std::vector>& handles, - const torch::optional maybeDevice = torch::optional()); +fromNVDB(const std::vector> &handles, + const torch::optional maybeDevice = + torch::optional()); nanovdb::GridHandle -toNVDB(const GridBatch& gridBatch, - const torch::optional maybeData = torch::optional(), - const torch::optional maybeNames = torch::optional()); +toNVDB(const GridBatch &gridBatch, + const torch::optional maybeData = torch::optional(), + const torch::optional maybeNames = + torch::optional()); std::tuple> -loadNVDB(const std::string& path, - const fvdb::NanoVDBFileGridIdentifier& gridIdentifier, - fvdb::TorchDeviceOrString device, - bool verbose); - -void saveNVDB(const std::string& path, - const GridBatch& gridBatch, - const torch::optional maybeData, +loadNVDB(const std::string &path, const fvdb::NanoVDBFileGridIdentifier &gridIdentifier, + fvdb::TorchDeviceOrString device, bool verbose); + +void saveNVDB(const std::string &path, const GridBatch &gridBatch, + const torch::optional maybeData, const torch::optional maybeNames, bool compressed = false, bool verbose = false); -} // namespace io -} // namespace detail -} // namespace fvdb +} // namespace io +} // namespace detail +} // namespace fvdb + +#endif // FVDB_DETAIL_IO_IO_H \ No newline at end of file diff --git a/fvdb/src/detail/io/LoadNanovdb.cpp b/fvdb/src/detail/io/LoadNanovdb.cpp index e1dfe1017f..7e509aa605 100644 --- a/fvdb/src/detail/io/LoadNanovdb.cpp +++ b/fvdb/src/detail/io/LoadNanovdb.cpp @@ -1,95 +1,121 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#include "detail/io/IO.h" +#include "IO.h" -#include +#include +#include +#include #include #include 
-#include #include +#include -#include "Types.h" -#include "detail/utils/Utils.h" -#include "detail/GridBatchImpl.h" - +#include namespace fvdb { namespace detail { namespace io { -/// @brief Get the gridId^th grid with build type SourceGrid in a grid handle and throw an exception if the grid is none +/// @brief Get the gridId^th grid with build type SourceGrid in a grid handle and throw an exception +/// if the grid is none /// @tparam GridType The build type of the grid to read /// @param handle The grid handle to read from /// @param gridId The index of the grid in the handle to read /// @param bi The batch index of the grid in the handle to read (this is only used for logging) /// @return A host pointer to the extracted grid template -const nanovdb::NanoGrid* getGrid(const nanovdb::GridHandle& handle, uint32_t gridId, uint32_t bi) { - const nanovdb::NanoGrid* grid = handle.grid(gridId); - char gridTypeStr[nanovdb::strlen()]; +const nanovdb::NanoGrid * +getGrid(const nanovdb::GridHandle &handle, uint32_t gridId, uint32_t bi) { + const nanovdb::NanoGrid *grid = handle.grid(gridId); + char gridTypeStr[nanovdb::strlen()]; nanovdb::toStr(gridTypeStr, handle.gridType(gridId)); char expectedGridTypeStr[nanovdb::strlen()]; nanovdb::toStr(expectedGridTypeStr, nanovdb::toGridType()); TORCH_CHECK(gridId < handle.gridCount(), - "Failed to load grid " + std::to_string(gridId) + " from handle at batch index " + std::to_string(bi) + - std::string(". Grid index out of bounds.")); - TORCH_CHECK(grid != nullptr, - "Failed to load grid " + std::to_string(gridId) + " from handle at batch index " + std::to_string(bi) + - std::string(". Grid has type ") + gridTypeStr + - std::string(", but expected ") + expectedGridTypeStr + "."); + "Failed to load grid " + std::to_string(gridId) + " from handle at batch index " + + std::to_string(bi) + std::string(". Grid index out of bounds.")); + TORCH_CHECK(grid != nullptr, "Failed to load grid " + std::to_string(gridId) + + " from handle at batch index " + std::to_string(bi) + + std::string(". Grid has type ") + gridTypeStr + + std::string(", but expected ") + expectedGridTypeStr + "."); return grid; } - /// @brief Set the (row) value at index rowIdx of a tensor with 2 dimensions. /// Specialized to accept useful nanovdb types (e.g. Vec3f, Vec4f, etc...) -/// @tparam TensorAccessorT The type of tensor accessor to use (e.g. torch::TensorAccessor, torch::PackedTensorAccessor) -/// @tparam ValueT The input type of the row to write to the tensor (e.g. float, nanovdb::Vec3f, nanovdb::Vec4f) +/// @tparam TensorAccessorT The type of tensor accessor to use (e.g. torch::TensorAccessor, +/// torch::PackedTensorAccessor) +/// @tparam ValueT The input type of the row to write to the tensor (e.g. 
float, nanovdb::Vec3f, +/// nanovdb::Vec4f) /// @param acc The accessor to the tensor (must refer to a 2D tensor) /// @param rowIdx The row to read from /// @return The rowIdx^th row of the tensor casted to ValueT template -inline void valueSetter(TensorAccessorT& acc, int idx, const ValueT& value) { +inline void +valueSetter(TensorAccessorT &acc, int idx, const ValueT &value) { acc[idx][0] = value; } template -inline void valueSetter(TensorAccessorT& acc, int idx, const nanovdb::Vec3f& value) { - acc[idx][0] = value[0]; acc[idx][1] = value[1]; acc[idx][2] = value[2]; +inline void +valueSetter(TensorAccessorT &acc, int idx, const nanovdb::Vec3f &value) { + acc[idx][0] = value[0]; + acc[idx][1] = value[1]; + acc[idx][2] = value[2]; } template -inline void valueSetter(TensorAccessorT& acc, int idx, const nanovdb::Vec3d& value) { - acc[idx][0] = value[0]; acc[idx][1] = value[1]; acc[idx][2] = value[2]; +inline void +valueSetter(TensorAccessorT &acc, int idx, const nanovdb::Vec3d &value) { + acc[idx][0] = value[0]; + acc[idx][1] = value[1]; + acc[idx][2] = value[2]; } template -inline void valueSetter(TensorAccessorT& acc, int idx, const nanovdb::Vec4f& value) { - acc[idx][0] = value[0]; acc[idx][1] = value[1]; acc[idx][2] = value[2]; acc[idx][3] = value[3]; +inline void +valueSetter(TensorAccessorT &acc, int idx, const nanovdb::Vec4f &value) { + acc[idx][0] = value[0]; + acc[idx][1] = value[1]; + acc[idx][2] = value[2]; + acc[idx][3] = value[3]; } template -inline void valueSetter(TensorAccessorT& acc, int idx, const nanovdb::Vec4d& value) { - acc[idx][0] = value[0]; acc[idx][1] = value[1]; acc[idx][2] = value[2]; acc[idx][3] = value[3]; +inline void +valueSetter(TensorAccessorT &acc, int idx, const nanovdb::Vec4d &value) { + acc[idx][0] = value[0]; + acc[idx][1] = value[1]; + acc[idx][2] = value[2]; + acc[idx][3] = value[3]; } template -inline void valueSetter(TensorAccessorT& acc, int idx, const nanovdb::math::Rgba8& value) { - acc[idx][0] = value.r(); acc[idx][1] = value.g(); acc[idx][2] = value.b(); acc[idx][3] = value.a(); +inline void +valueSetter(TensorAccessorT &acc, int idx, const nanovdb::math::Rgba8 &value) { + acc[idx][0] = value.r(); + acc[idx][1] = value.g(); + acc[idx][2] = value.b(); + acc[idx][3] = value.a(); } /// @brief Return whether a nanovdb blind metadata is a valid FVDB tensor grid blind metadata, /// and if so, what the dtype is (if any). -/// FVDB Blind data is named "fvdb_jdata" where dtype is an optional dtype name. If no dtype is specified, -/// then the blind data just records the size of the tensor, and the scalar type should be determinied from the -/// grid type itself (e.g. Vec3f grids will have a float32 scalar type). +/// FVDB Blind data is named "fvdb_jdata" where dtype is an optional dtype name. If no +/// dtype is specified, then the blind data just records the size of the tensor, and the +/// scalar type should be determinied from the grid type itself (e.g. Vec3f grids will have a +/// float32 scalar type). 
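/// For illustration (this mirrors the checks in the function body; `meta` stands for the
/// blind-metadata argument, and the accepted dtype spellings are whatever
/// StringToTorchScalarType understands):
///   strncmp(meta.mName, "fvdb_jdata", 10) == 0                         // required prefix
///   strnlen(meta.mName, nanovdb::GridBlindMetaData::MaxNameSize) == 10 // no suffix: scalar type comes from the grid type
///   std::string(meta.mName + 10)                                       // otherwise the suffix names the torch dtype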
/// @param blindMetadata The blind metadata to check -/// @return A tuple containing whether the blind metadata is valid, and the dtype of the tensor (or None if no dtype is specified) -std::tuple> isFvdbBlindData(const nanovdb::GridBlindMetaData& blindMetadata) { - if(strncmp(blindMetadata.mName, "fvdb_jdata", 10) != 0) { +/// @return A tuple containing whether the blind metadata is valid, and the dtype of the tensor (or +/// None if no dtype is specified) +std::tuple> +isFvdbBlindData(const nanovdb::GridBlindMetaData &blindMetadata) { + if (strncmp(blindMetadata.mName, "fvdb_jdata", 10) != 0) { return std::make_tuple(false, torch::nullopt); } // Check if we load the dtype name, we won't overrun the buffer - const int64_t blindDataNameLen = strnlen(blindMetadata.mName, nanovdb::GridBlindMetaData::MaxNameSize); - TORCH_CHECK(blindDataNameLen < nanovdb::GridBlindMetaData::MaxNameSize, "Invalid blind metadata for nanovdb grid."); + const int64_t blindDataNameLen = + strnlen(blindMetadata.mName, nanovdb::GridBlindMetaData::MaxNameSize); + TORCH_CHECK(blindDataNameLen < nanovdb::GridBlindMetaData::MaxNameSize, + "Invalid blind metadata for nanovdb grid."); // There's no scalar type specified -- we're just storing a size of the tensor if (blindDataNameLen == 10) { @@ -97,192 +123,238 @@ std::tuple> isFvdbBlindData(const nanovdb::G } // Get the dtype of the blind data tensor - const std::string blindDtypeName = std::string(blindMetadata.mName + 10); - const torch::Dtype blindDtype = StringToTorchScalarType(blindDtypeName); + const std::string blindDtypeName = std::string(blindMetadata.mName + 10); + const torch::Dtype blindDtype = StringToTorchScalarType(blindDtypeName); return std::make_tuple(true, torch::optional(blindDtype)); } - -/// @brief Copy a source index grid (ValueIndex(Mask) or ValueOnIndex(Mask)) to a nanovdb::GridHandle. -/// If the source type is ValueIndex or ValueIndex mask it will be set to ValueOnIndex or ValueOnIndexMask respectively. -/// @tparam SourceGridType The type of the source grid (must be a nanovdb::ValueIndex or nanovdb::ValueIndexMask) +/// @brief Copy a source index grid (ValueIndex(Mask) or ValueOnIndex(Mask)) to a +/// nanovdb::GridHandle. +/// If the source type is ValueIndex or ValueIndex mask it will be set to ValueOnIndex or +/// ValueOnIndexMask respectively. 
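/// A minimal usage sketch (the pointer name is a placeholder; the allowed pairings follow
/// the static_asserts in the implementation):
///   auto h1 = copyIndexGridToHandle<nanovdb::ValueIndex, nanovdb::ValueOnIndex>(srcGrid);
///   auto h2 = copyIndexGridToHandle<nanovdb::ValueIndexMask, nanovdb::ValueOnIndexMask>(srcGrid);
///   // Masked source types must map to masked target types, and unmasked to unmasked.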
+/// @tparam SourceGridType The type of the source grid (must be a nanovdb::ValueIndex or +/// nanovdb::ValueIndexMask) /// @tparam TargetGridType The type of the target grid (must be a form of index grid) /// @param sourceGrid A host pointer to the source grid to copy /// @return A handle to the copied grid template -nanovdb::GridHandle copyIndexGridToHandle(const nanovdb::NanoGrid* sourceGrid) { - constexpr bool isSrcValueOnIndex = nanovdb::util::is_same::value; - constexpr bool isSrcValueOnIndexMask = nanovdb::util::is_same::value; - constexpr bool isSrcValueIndex = nanovdb::util::is_same::value; - constexpr bool isSrcValueIndexMask = nanovdb::util::is_same::value; - constexpr bool isTgtValueOnIndex = nanovdb::util::is_same::value; - constexpr bool isTgtValueOnIndexMask = nanovdb::util::is_same::value; - - static_assert(isSrcValueOnIndex || isSrcValueOnIndexMask || isSrcValueIndex || isSrcValueIndexMask, +nanovdb::GridHandle +copyIndexGridToHandle(const nanovdb::NanoGrid *sourceGrid) { + constexpr bool isSrcValueOnIndex = + nanovdb::util::is_same::value; + constexpr bool isSrcValueOnIndexMask = + nanovdb::util::is_same::value; + constexpr bool isSrcValueIndex = + nanovdb::util::is_same::value; + constexpr bool isSrcValueIndexMask = + nanovdb::util::is_same::value; + constexpr bool isTgtValueOnIndex = + nanovdb::util::is_same::value; + constexpr bool isTgtValueOnIndexMask = + nanovdb::util::is_same::value; + + static_assert(isSrcValueOnIndex || isSrcValueOnIndexMask || isSrcValueIndex || + isSrcValueIndexMask, "Bad source type in copyIndexGridToHandle must be an Index grid type."); - static_assert(isTgtValueOnIndex || isTgtValueOnIndexMask, - "Bad target type in copyIndexGridToHandle must be ValueOnIndex or ValueOnIndexMask."); - static_assert((isTgtValueOnIndex && (isSrcValueIndex || isSrcValueOnIndex)) || - (isTgtValueOnIndexMask && (isSrcValueIndexMask || isSrcValueOnIndexMask)), - "Bad target grid type for given source grid type in copyIndexGridToHandle. If source is a masked grid, then target must also be a masked grid."); - - const ptrdiff_t gridSize = sourceGrid->blindDataCount() > 0 ? nanovdb::util::PtrDiff(&sourceGrid->blindMetaData(0), sourceGrid) : sourceGrid->gridSize(); + static_assert( + isTgtValueOnIndex || isTgtValueOnIndexMask, + "Bad target type in copyIndexGridToHandle must be ValueOnIndex or ValueOnIndexMask."); + static_assert( + (isTgtValueOnIndex && (isSrcValueIndex || isSrcValueOnIndex)) || + (isTgtValueOnIndexMask && (isSrcValueIndexMask || isSrcValueOnIndexMask)), + "Bad target grid type for given source grid type in copyIndexGridToHandle. If source is a masked grid, then target must also be a masked grid."); + + const ptrdiff_t gridSize = + sourceGrid->blindDataCount() > 0 + ? 
nanovdb::util::PtrDiff(&sourceGrid->blindMetaData(0), sourceGrid) + : sourceGrid->gridSize(); TorchDeviceBuffer buf(gridSize); memcpy(buf.data(), sourceGrid, gridSize); - nanovdb::GridData* data = reinterpret_cast(buf.data()); - data->mGridCount = 1; - data->mGridSize = gridSize; - data->mGridClass = nanovdb::GridClass::IndexGrid; - data->mGridType = nanovdb::toGridType(); + nanovdb::GridData *data = reinterpret_cast(buf.data()); + data->mGridCount = 1; + data->mGridSize = gridSize; + data->mGridClass = nanovdb::GridClass::IndexGrid; + data->mGridType = nanovdb::toGridType(); return nanovdb::GridHandle(std::move(buf)); } - -/// @brief Load a nanovdb ValueOnIndex or ValueOnIndexMask grid with tensor blind metatada (GridClass = TensorGrid) into +/// @brief Load a nanovdb ValueOnIndex or ValueOnIndexMask grid with tensor blind metadata +/// (GridClass = TensorGrid) into /// an index grid of the same type stored in a TorchDeviceBuffer) and a torch tensor of data /// (i.e. the standard grid format for FVDB). -/// @tparam SourceGridType The type of the source grid (must be a nanovdb::ValueOnIndex or nanovdb::ValueOnIndexMask) +/// @tparam SourceGridType The type of the source grid (must be a nanovdb::ValueOnIndex or +/// nanovdb::ValueOnIndexMask) /// @tparam TargetGridType The type of the target grid (must be a form of index grid) /// @param sourceGrid A host pointer to the source grid to load -/// @return A tuple containing the index grid, the name of the grid, the tensor of data, the voxel size, and the voxel origin +/// @return A tuple containing the index grid, the name of the grid, the tensor of data, the voxel +/// size, and the voxel origin template -std::tuple, std::string, torch::Tensor, nanovdb::Vec3d, nanovdb::Vec3d> -nanovdbTensorGridToFVDBGrid(const nanovdb::NanoGrid* sourceGrid) { - static_assert(nanovdb::util::is_same::value || - nanovdb::util::is_same::value, - "Bad source grid type in nanovdbTensorGridToFVDBGrid. Must be ValueOnIndex or ValueOnIndexMask."); - static_assert(nanovdb::util::is_same::value || - nanovdb::util::is_same::value, - "Bad target grid type in nanovdbTensorGridToFVDBGrid. Must be ValueOnIndex or ValueOnIndexMask."); - static_assert(nanovdb::util::is_same::value, - "Mismatched source and target grid types in nanovdbTensorGridToFVDBGrid. They must be identical."); - - TORCH_CHECK(sourceGrid->gridClass() == nanovdb::GridClass::TensorGrid, "Invalid grid class: Index grids which are not saved with fVDB are not yet supported."); +std::tuple, std::string, torch::Tensor, nanovdb::Vec3d, + nanovdb::Vec3d> +nanovdbTensorGridToFVDBGrid(const nanovdb::NanoGrid *sourceGrid) { + static_assert( + nanovdb::util::is_same::value || + nanovdb::util::is_same::value, + "Bad source grid type in nanovdbTensorGridToFVDBGrid. Must be ValueOnIndex or ValueOnIndexMask."); + static_assert( + nanovdb::util::is_same::value || + nanovdb::util::is_same::value, + "Bad target grid type in nanovdbTensorGridToFVDBGrid. Must be ValueOnIndex or ValueOnIndexMask."); + static_assert( + nanovdb::util::is_same::value, + "Mismatched source and target grid types in nanovdbTensorGridToFVDBGrid.
They must be identical."); + + TORCH_CHECK( + sourceGrid->gridClass() == nanovdb::GridClass::TensorGrid, + "Invalid grid class: Index grids which are not saved with fVDB are not yet supported."); // Copy the index grid from the loaded buffer and update metadata to be consistent with FVDB - nanovdb::GridHandle retHandle = copyIndexGridToHandle(sourceGrid); + nanovdb::GridHandle retHandle = + copyIndexGridToHandle(sourceGrid); // Check if this grid has FVDB blind data attached to it - bool foundFVDB = false; + bool foundFVDB = false; torch::Dtype blindDtype; for (unsigned i = 0; i < sourceGrid->blindDataCount(); i += 1) { - const nanovdb::GridBlindMetaData& blindMetadata = sourceGrid->blindMetaData(i); + const nanovdb::GridBlindMetaData &blindMetadata = sourceGrid->blindMetaData(i); // Don't need to warn for grid name if (blindMetadata.mDataClass == nanovdb::GridBlindDataClass::GridName) { continue; } - std::tuple> isFvdb = isFvdbBlindData(sourceGrid->blindMetaData(0)); + std::tuple> isFvdb = + isFvdbBlindData(sourceGrid->blindMetaData(0)); if (std::get<0>(isFvdb)) { - TORCH_CHECK(!foundFVDB, "Internal Error: Grid has multiple FVDB blind data tensors. Only one is supported."); - TORCH_CHECK(std::get<1>(isFvdb).has_value(), "Invalid blind metadata for nanovdb Tensor grid."); - foundFVDB = true; + TORCH_CHECK( + !foundFVDB, + "Internal Error: Grid has multiple FVDB blind data tensors. Only one is supported."); + TORCH_CHECK(std::get<1>(isFvdb).has_value(), + "Invalid blind metadata for nanovdb Tensor grid."); + foundFVDB = true; blindDtype = std::get<1>(isFvdb).value(); } else { - TORCH_WARN("Grid has blind data, but it is not valid FVDB blind data. Blind data will be ignored."); + TORCH_WARN( + "Grid has blind data, but it is not valid FVDB blind data.
Blind data will be ignored."); } } - // If there is no FVDB blind data, this is just an index grid, so just return an empty data tensor + // If there is no FVDB blind data, this is just an index grid, so just return an empty data + // tensor if (!foundFVDB) { - return std::make_tuple(std::move(retHandle), - sourceGrid->gridName(), - torch::empty({0}), + return std::make_tuple(std::move(retHandle), sourceGrid->gridName(), torch::empty({ 0 }), sourceGrid->data()->mVoxelSize, sourceGrid->data()->mMap.applyMap(nanovdb::Vec3d(0.0))); } // Pointer to actual blind data - uint8_t* readHead = (uint8_t*)(sourceGrid->blindMetaData(0).blindData()); + uint8_t *readHead = (uint8_t *)(sourceGrid->blindMetaData(0).blindData()); // Read the shape of the tensor - const int64_t ndim = *reinterpret_cast(readHead); + const int64_t ndim = *reinterpret_cast(readHead); readHead += sizeof(int64_t); std::vector blindDataShape; blindDataShape.reserve(ndim); for (int i = 0; i < ndim; i++) { - blindDataShape.push_back(*reinterpret_cast(readHead)); + blindDataShape.push_back(*reinterpret_cast(readHead)); readHead += sizeof(int64_t); } // Copy the blind data tensor - torch::Tensor retData = torch::from_blob(const_cast(readHead), blindDataShape, blindDtype).clone(); + torch::Tensor retData = + torch::from_blob(const_cast(readHead), blindDataShape, blindDtype).clone(); // Load the name and the transform - const std::string name = sourceGrid->gridName(); - const nanovdb::Vec3d voxSize = sourceGrid->mVoxelSize; + const std::string name = sourceGrid->gridName(); + const nanovdb::Vec3d voxSize = sourceGrid->mVoxelSize; const nanovdb::Vec3d voxOrigin = sourceGrid->mMap.applyMap(nanovdb::Vec3d(0.0)); return std::make_tuple(std::move(retHandle), name, retData, voxSize, voxOrigin); } -/// @brief Load a nanovdb index grid (ValueOnIndex(Mask) or ValueIndex(Mask)) into an ValueOnIndex or ValueIndex grid -/// (stored in a TorchDeviceBuffer) and an empty tensor of data (i.e. the standard grid format for FVDB). +/// @brief Load a nanovdb index grid (ValueOnIndex(Mask) or ValueIndex(Mask)) into an ValueOnIndex +/// or ValueIndex grid +/// (stored in a TorchDeviceBuffer) and an empty tensor of data (i.e. the standard grid +/// format for FVDB). 
/// @tparam SourceGridType The type of the source grid (must not be an index grid) -/// @tparam TargetGridType The type of the target grid (must be a nanovdb::ValueOnIndex or nanovdb::ValueOnIndexMask) +/// @tparam TargetGridType The type of the target grid (must be a nanovdb::ValueOnIndex or +/// nanovdb::ValueOnIndexMask) /// @param sourceGrid A host pointer to the source grid to load -/// @return A tuple containing the index grid, the name of the grid, the empty tensor of data, the voxel size, and the voxel origin +/// @return A tuple containing the index grid, the name of the grid, the empty tensor of data, the +/// voxel size, and the voxel origin template -std::tuple, std::string, torch::Tensor, nanovdb::Vec3d, nanovdb::Vec3d> -nanovdbIndexGridToFVDBGrid(const nanovdb::NanoGrid* sourceGrid) { - nanovdb::GridHandle retHandle = copyIndexGridToHandle(sourceGrid); - const std::string name = sourceGrid->gridName(); - const nanovdb::Vec3d voxSize = sourceGrid->data()->mVoxelSize; +std::tuple, std::string, torch::Tensor, nanovdb::Vec3d, + nanovdb::Vec3d> +nanovdbIndexGridToFVDBGrid(const nanovdb::NanoGrid *sourceGrid) { + nanovdb::GridHandle retHandle = + copyIndexGridToHandle(sourceGrid); + const std::string name = sourceGrid->gridName(); + const nanovdb::Vec3d voxSize = sourceGrid->data()->mVoxelSize; const nanovdb::Vec3d voxOrigin = sourceGrid->data()->mMap.applyMap(nanovdb::Vec3d(0.0)); - return std::make_tuple(std::move(retHandle), name, torch::empty({0}), voxSize, voxOrigin); + return std::make_tuple(std::move(retHandle), name, torch::empty({ 0 }), voxSize, voxOrigin); } - -/// @brief Load a nanovdb grid with scalar or vector data stored in the leaves into a ValueOnIndex grid -/// (stored in a TorchDeviceBuffer) and a tensor of data (i.e. the standard grid format for FVDB). +/// @brief Load a nanovdb grid with scalar or vector data stored in the leaves into a ValueOnIndex +/// grid +/// (stored in a TorchDeviceBuffer) and a tensor of data (i.e. the standard grid format for +/// FVDB). 
/// @tparam SourceGridType The type of the source grid (must not be an index grid) -/// @tparam TargetGridType The type of the target grid (must be a nanovdb::ValueOnIndex or nanovdb::ValueOnIndexMask) +/// @tparam TargetGridType The type of the target grid (must be a nanovdb::ValueOnIndex or +/// nanovdb::ValueOnIndexMask) /// @tparam ScalarType The scalar type of data stored in the source grid /// @tparam DataDim The dimension of the data stored in the source grid /// @param sourceGrid A host pointer to the source grid to load -/// @return A tuple containing the index grid, the name of the grid, the tensor of data, the voxel size, and the voxel origin +/// @return A tuple containing the index grid, the name of the grid, the tensor of data, the voxel +/// size, and the voxel origin template -std::tuple, std::string, torch::Tensor, nanovdb::Vec3d, nanovdb::Vec3d> -nanovdbGridToFvdbGrid(const nanovdb::NanoGrid* sourceGrid) { +std::tuple, std::string, torch::Tensor, nanovdb::Vec3d, + nanovdb::Vec3d> +nanovdbGridToFvdbGrid(const nanovdb::NanoGrid *sourceGrid) { static_assert(nanovdb::util::is_same::value, "Bad target type in copyIndexGridToHandle must be ValueOnIndex."); - static_assert(!nanovdb::util::is_same::value && - !nanovdb::util::is_same::value && - !nanovdb::util::is_same::value && - !nanovdb::util::is_same::value, - "Bad source type in nanovdbGridToIndexGridAndData must NOT be an Index grid type."); + static_assert( + !nanovdb::util::is_same::value && + !nanovdb::util::is_same::value && + !nanovdb::util::is_same::value && + !nanovdb::util::is_same::value, + "Bad source type in nanovdbGridToIndexGridAndData must NOT be an Index grid type."); // Create the index grid for the loaded grid - using ProxyGridT = nanovdb::tools::build::Grid; - auto proxyGrid = std::make_shared(0.0f); + using ProxyGridT = nanovdb::tools::build::Grid; + auto proxyGrid = std::make_shared(0.0f); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); - for (auto it = ActiveVoxelIteratorIJKOnly(sourceGrid->tree()); it.isValid(); it++) { + for (auto it = ActiveVoxelIteratorIJKOnly(sourceGrid->tree()); it.isValid(); + it++) { proxyGridAccessor.setValue(*it, 1.0f); } proxyGridAccessor.merge(); - nanovdb::GridHandle retHandle = nanovdb::tools::createNanoGrid(*proxyGrid, 0u, false, false); - nanovdb::NanoGrid* outGrid = retHandle.template grid(); + nanovdb::GridHandle retHandle = + nanovdb::tools::createNanoGrid( + *proxyGrid, 0u, false, false); + nanovdb::NanoGrid *outGrid = retHandle.template grid(); TORCH_CHECK(outGrid != nullptr, "Internal error: failed to get outGrid."); - TORCH_CHECK(outGrid->gridClass() == nanovdb::GridClass::IndexGrid, "Internal error: outGrid is not an index grid."); - TORCH_CHECK(outGrid->gridType() == nanovdb::GridType::OnIndex || outGrid->gridType() == nanovdb::GridType::OnIndexMask, + TORCH_CHECK(outGrid->gridClass() == nanovdb::GridClass::IndexGrid, + "Internal error: outGrid is not an index grid."); + TORCH_CHECK(outGrid->gridType() == nanovdb::GridType::OnIndex || + outGrid->gridType() == nanovdb::GridType::OnIndexMask, "Internal error: outGrid is not an index grid."); // Load data at the voxels into a tensor - int64_t numVox = outGrid->activeVoxelCount(); - int64_t dim = DataDim; - torch::TensorOptions opts = torch::TensorOptions().device(torch::kCPU).dtype(); - torch::Tensor outData = torch::empty({numVox, dim}, opts); - auto outDataAcc = outData.accessor(); - auto sourceGridAccessor = sourceGrid->getAccessor(); + int64_t numVox = outGrid->activeVoxelCount(); + int64_t dim = 
DataDim; + torch::TensorOptions opts = torch::TensorOptions().device(torch::kCPU).dtype(); + torch::Tensor outData = torch::empty({ numVox, dim }, opts); + auto outDataAcc = outData.accessor(); + auto sourceGridAccessor = sourceGrid->getAccessor(); for (auto it = ActiveVoxelIterator(outGrid->tree()); it.isValid(); it++) { valueSetter(outDataAcc, it->second, sourceGridAccessor.getValue(it->first)); } // If there's extra blind data we need to load, check if any of it is FVDB blind data. - // We use FVDB blind data in save to store the shape of the tensor so we can load it back in the same shape - // the user saved it in. This lets us handle saving (N, 1), (1, N, 1), (N, )... shaped tensors properly. + // We use FVDB blind data in save to store the shape of the tensor so we can load it back in the + // same shape the user saved it in. This lets us handle saving (N, 1), (1, N, 1), (N, )... + // shaped tensors properly. bool foundFVDB = false; for (unsigned i = 0; i < sourceGrid->blindDataCount(); i += 1) { - const nanovdb::GridBlindMetaData& blindMetadata = sourceGrid->blindMetaData(i); + const nanovdb::GridBlindMetaData &blindMetadata = sourceGrid->blindMetaData(i); // Don't need to warn for grid name if (blindMetadata.mDataClass == nanovdb::GridBlindDataClass::GridName) { @@ -292,25 +364,30 @@ nanovdbGridToFvdbGrid(const nanovdb::NanoGrid* sourceGrid) { // Otherwise, check if this is an FVDB blind data tensor std::tuple> isFvdb = isFvdbBlindData(blindMetadata); if (!std::get<0>(isFvdb)) { - TORCH_WARN("Grid has blind data, but it is not valid FVDB blind data. Blind data will be ignored."); + TORCH_WARN( + "Grid has blind data, but it is not valid FVDB blind data. Blind data will be ignored."); } else { - TORCH_CHECK(!foundFVDB, "Internal Error: Grid has multiple FVDB blind data tensors. Only one is supported."); + TORCH_CHECK( + !foundFVDB, + "Internal Error: Grid has multiple FVDB blind data tensors. Only one is supported."); foundFVDB = true; - TORCH_CHECK(!std::get<1>(isFvdb).has_value(), - "Invalid FVDB blind metadata for nanovdb grid. Should not have extra type."); + TORCH_CHECK( + !std::get<1>(isFvdb).has_value(), + "Invalid FVDB blind metadata for nanovdb grid. Should not have extra type."); // Pointer to actual blind data - uint8_t* readHead = (uint8_t*)(sourceGrid->blindMetaData(0).blindData()); + uint8_t *readHead = (uint8_t *)(sourceGrid->blindMetaData(0).blindData()); // Read the shape of the tensor - const int64_t ndim = *reinterpret_cast(readHead); - TORCH_CHECK(sourceGrid->blindMetaData(0).blindDataSize() == nanovdb::math::AlignUp<32U>(sizeof(int64_t) * (ndim + 1)), + const int64_t ndim = *reinterpret_cast(readHead); + TORCH_CHECK(sourceGrid->blindMetaData(0).blindDataSize() == + nanovdb::math::AlignUp<32U>(sizeof(int64_t) * (ndim + 1)), "Invalid FVDB blind data for nanovdb grid. 
Unexpected size."); readHead += sizeof(int64_t); std::vector blindDataShape; blindDataShape.reserve(ndim); for (int i = 0; i < ndim; i++) { - blindDataShape.push_back(*reinterpret_cast(readHead)); + blindDataShape.push_back(*reinterpret_cast(readHead)); readHead += sizeof(int64_t); } @@ -319,147 +396,157 @@ nanovdbGridToFvdbGrid(const nanovdb::NanoGrid* sourceGrid) { } // Load the name and the transform - const std::string name = sourceGrid->gridName(); - const nanovdb::Vec3d voxSize = sourceGrid->data()->mVoxelSize; + const std::string name = sourceGrid->gridName(); + const nanovdb::Vec3d voxSize = sourceGrid->data()->mVoxelSize; const nanovdb::Vec3d voxOrigin = sourceGrid->data()->mMap.applyMap(nanovdb::Vec3d(0.0)); return std::make_tuple(std::move(retHandle), name, outData, voxSize, voxOrigin); } - -/// @brief Load a single nanovdb grid in a nanovdb::GridHandle into an ValueOnIndex or ValueOnIndexMask grid -/// stored in a nanovdb::GridHandle as well as torch::Tensor encoding the data at the voxels -/// (i.e. the standard format for FVDB). -/// There are 3 cases: +/// @brief Load a single nanovdb grid in a nanovdb::GridHandle into an +/// ValueOnIndex or ValueOnIndexMask grid +/// stored in a nanovdb::GridHandle as well as torch::Tensor encoding the +/// data at the voxels (i.e. the standard format for FVDB). There are 3 cases: /// 1. The input grid has scalar or vector values at the leaves: /// - Load a ValueOnIndex grid and torch::Tensor of values -/// 2. The input grid is a ValueOnIndex or ValueOnIndexMask and has its grid class set to TensorGrid: -/// - Load a matching ValueOnIndex or ValueOnIndexMask grid and torch::Tensor of values corresponding to +/// 2. The input grid is a ValueOnIndex or ValueOnIndexMask and has its grid class set to +/// TensorGrid: +/// - Load a matching ValueOnIndex or ValueOnIndexMask grid and torch::Tensor of values +/// corresponding to /// the blind data (if it is present) -/// 3. The input grid is an index grid (ValueIndex(Mask) or ValueOnIndex(Mask)) but doesn't have a TensorGrid class set: -/// - Load a ValueOnIndex or ValueOnIndexMask grid (depending if the input type has a mask or not) and an empty torch::Tensor of values +/// 3. 
The input grid is an index grid (ValueIndex(Mask) or ValueOnIndex(Mask)) but doesn't +/// have a TensorGrid class set: +/// - Load a ValueOnIndex or ValueOnIndexMask grid (depending if the input type has a +/// mask or not) and an empty torch::Tensor of values /// /// @param handle The grid handle to read from /// @param gridId The index of the grid in the handle to read /// @param bi The batch index of the grid in the handle to read (this is only used for logging) -/// @return A tuple containing the loaded index grid, the name of the grid, the tensor of data, the voxel size, and the voxel origin -std::tuple, std::string, torch::Tensor, nanovdb::Vec3d, nanovdb::Vec3d> -loadOneGrid(const nanovdb::GridHandle& handle, uint32_t gridId, uint32_t bi) { - +/// @return A tuple containing the loaded index grid, the name of the grid, the tensor of data, the +/// voxel size, and the voxel origin +std::tuple, std::string, torch::Tensor, nanovdb::Vec3d, + nanovdb::Vec3d> +loadOneGrid(const nanovdb::GridHandle &handle, uint32_t gridId, uint32_t bi) { if (handle.gridMetaData()->gridClass() == nanovdb::GridClass::TensorGrid) { - TORCH_CHECK(handle.gridType() == nanovdb::GridType::OnIndex || handle.gridType() == nanovdb::GridType::OnIndexMask, - "Invalid grid type: Tensor grids which are not saved with fVDB are not yet supported."); + TORCH_CHECK( + handle.gridType() == nanovdb::GridType::OnIndex || + handle.gridType() == nanovdb::GridType::OnIndexMask, + "Invalid grid type: Tensor grids which are not saved with fVDB are not yet supported."); if (handle.gridType() == nanovdb::GridType::OnIndex) { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbTensorGridToFVDBGrid(sourceGrid); + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbTensorGridToFVDBGrid( + sourceGrid); } else if (handle.gridType() == nanovdb::GridType::OnIndexMask) { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbTensorGridToFVDBGrid(sourceGrid); + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbTensorGridToFVDBGrid(sourceGrid); } } switch (handle.gridType()) { - case nanovdb::GridType::Float: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::Double: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::Int32: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::Int64: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::Mask: - case nanovdb::GridType::Boolean: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::Vec3f: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::Vec3d: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::RGBA8: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::Vec4f: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, 
gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::Vec4d: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::Fp16: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbGridToFvdbGrid(sourceGrid); - } - case nanovdb::GridType::Index: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbIndexGridToFVDBGrid(sourceGrid); - } - case nanovdb::GridType::IndexMask: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbIndexGridToFVDBGrid(sourceGrid); - } - case nanovdb::GridType::OnIndex: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbIndexGridToFVDBGrid(sourceGrid); - } - case nanovdb::GridType::OnIndexMask: - { - const nanovdb::NanoGrid* sourceGrid = getGrid(handle, gridId, bi); - return nanovdbIndexGridToFVDBGrid(sourceGrid); - } - default: - // Unhandled cases include: Int16, UInt32, Fp4, Fp8, FpN - char gridTypeStr[nanovdb::strlen()]; - nanovdb::toStr(gridTypeStr, handle.gridType()); - throw std::runtime_error( - std::string("Grid type not supported: ") + gridTypeStr); + case nanovdb::GridType::Float: { + const nanovdb::NanoGrid *sourceGrid = getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid(sourceGrid); + } + case nanovdb::GridType::Double: { + const nanovdb::NanoGrid *sourceGrid = getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid(sourceGrid); + } + case nanovdb::GridType::Int32: { + const nanovdb::NanoGrid *sourceGrid = getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid(sourceGrid); + } + case nanovdb::GridType::Int64: { + const nanovdb::NanoGrid *sourceGrid = getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid(sourceGrid); + } + case nanovdb::GridType::Mask: + case nanovdb::GridType::Boolean: { + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid( + sourceGrid); + } + case nanovdb::GridType::Vec3f: { + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid(sourceGrid); + } + case nanovdb::GridType::Vec3d: { + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid(sourceGrid); + } + case nanovdb::GridType::RGBA8: { + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid( + sourceGrid); + } + case nanovdb::GridType::Vec4f: { + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid(sourceGrid); + } + case nanovdb::GridType::Vec4d: { + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid(sourceGrid); + } + case nanovdb::GridType::Fp16: { + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbGridToFvdbGrid( + sourceGrid); + } + case nanovdb::GridType::Index: { + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbIndexGridToFVDBGrid(sourceGrid); + } + case nanovdb::GridType::IndexMask: { + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbIndexGridToFVDBGrid( + sourceGrid); + } + case nanovdb::GridType::OnIndex: { + const nanovdb::NanoGrid *sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbIndexGridToFVDBGrid(sourceGrid); + } + case nanovdb::GridType::OnIndexMask: { + const nanovdb::NanoGrid 
*sourceGrid = + getGrid(handle, gridId, bi); + return nanovdbIndexGridToFVDBGrid( + sourceGrid); + } + default: + // Unhandled cases include: Int16, UInt32, Fp4, Fp8, FpN + char gridTypeStr[nanovdb::strlen()]; + nanovdb::toStr(gridTypeStr, handle.gridType()); + throw std::runtime_error(std::string("Grid type not supported: ") + gridTypeStr); } } std::tuple> -fromNVDB(nanovdb::GridHandle& handle, +fromNVDB(nanovdb::GridHandle &handle, const torch::optional maybeDevice) { - return fromNVDB({handle}, maybeDevice); + return fromNVDB({ handle }, maybeDevice); } std::tuple> -fromNVDB(const std::vector>& handles, - const torch::optional maybeDevice) { +fromNVDB(const std::vector> &handles, + const torch::optional maybeDevice) { // Load the grids, data, names, voxel origins, and sizes - std::vector data; + std::vector data; std::vector> grids; - std::vector voxSizes, voxOrigins; - std::vector names; - uint32_t bi = 0; - nanovdb::GridType lastGridType = nanovdb::GridType::Unknown; + std::vector voxSizes, voxOrigins; + std::vector names; + uint32_t bi = 0; + nanovdb::GridType lastGridType = nanovdb::GridType::Unknown; for (size_t handleId = 0; handleId < handles.size(); handleId += 1) { for (size_t gridId = 0; gridId < handles[handleId].gridCount(); gridId += 1) { auto gridData = loadOneGrid(handles[handleId], gridId, bi); @@ -474,11 +561,12 @@ fromNVDB(const std::vector>& handles, // In all but two cases, we load a ValueOnIndex grid and a tensor of data: // 1. When the user saved a mutable Tensor grid with save // 2. When the user loaded a batch with a ValueOnIndexMask grid - // If the file the list of grids the user loaded contains a mix of ValueOnIndex and ValueOnIndexMask grids, - // then it's unclear what to do, so throw an exception. + // If the file the list of grids the user loaded contains a mix of ValueOnIndex and + // ValueOnIndexMask grids, then it's unclear what to do, so throw an exception. if (bi > 0) { - TORCH_CHECK(lastGridType == grids.back().gridData()->mGridType, - "All grids in a batch must have the same mutability (i.e. all ValueOnIndex or all ValueOnIndexMask)."); + TORCH_CHECK( + lastGridType == grids.back().gridData()->mGridType, + "All grids in a batch must have the same mutability (i.e. 
all ValueOnIndex or all ValueOnIndexMask)."); } lastGridType = grids.back().gridData()->mGridType; @@ -487,9 +575,11 @@ fromNVDB(const std::vector>& handles, } // Merge all the loaded grids into a single handle - TORCH_CHECK_VALUE(grids.size() <= fvdb::GridBatch::MAX_GRIDS_PER_BATCH, "Cannot load more than ", fvdb::GridBatch::MAX_GRIDS_PER_BATCH, " grids."); + TORCH_CHECK_VALUE(grids.size() <= fvdb::GridBatch::MAX_GRIDS_PER_BATCH, + "Cannot load more than ", fvdb::GridBatch::MAX_GRIDS_PER_BATCH, " grids."); nanovdb::GridHandle resCpu = nanovdb::mergeGrids(grids); - c10::intrusive_ptr ret = c10::make_intrusive(std::move(resCpu), voxSizes, voxOrigins); + c10::intrusive_ptr ret = + c10::make_intrusive(std::move(resCpu), voxSizes, voxOrigins); // Merge loaded data Tensors into a JaggedTensor JaggedTensor dataJagged(data); @@ -498,7 +588,7 @@ fromNVDB(const std::vector>& handles, if (maybeDevice.has_value()) { torch::Device toDevice = maybeDevice.value().value(); if (toDevice != ret->device()) { - ret = ret->clone(toDevice); + ret = ret->clone(toDevice); dataJagged = dataJagged.to(toDevice); } } @@ -507,28 +597,28 @@ fromNVDB(const std::vector>& handles, } std::tuple> -loadNVDB(const std::string& path, - const NanoVDBFileGridIdentifier& gridIdentifier, - TorchDeviceOrString device, - bool verbose) { - +loadNVDB(const std::string &path, const NanoVDBFileGridIdentifier &gridIdentifier, + TorchDeviceOrString device, bool verbose) { // Load a std::vector of grid handles each containing a one grid to load // If the user specified specific indices or names of grid to load, use that as a filter std::vector> sourceHandles; if (gridIdentifier.specifiesIndices()) { - for (uint64_t index : gridIdentifier.indicesValue()) { + for (uint64_t index: gridIdentifier.indicesValue()) { try { - sourceHandles.emplace_back(nanovdb::io::readGrid(path, index, verbose)); - } catch(std::runtime_error& e) { + sourceHandles.emplace_back( + nanovdb::io::readGrid(path, index, verbose)); + } catch (std::runtime_error &e) { TORCH_CHECK_INDEX(false, "Grid id ", index, " is out of range."); } } } else if (gridIdentifier.specifiesNames()) { - for (const std::string& name : gridIdentifier.namesValue()) { + for (const std::string &name: gridIdentifier.namesValue()) { try { - sourceHandles.emplace_back(nanovdb::io::readGrid(path, name, verbose)); - } catch(std::runtime_error& e) { - TORCH_CHECK_INDEX(false, "Grid with name '", name, "' not found in file '", path, "'."); + sourceHandles.emplace_back( + nanovdb::io::readGrid(path, name, verbose)); + } catch (std::runtime_error &e) { + TORCH_CHECK_INDEX(false, "Grid with name '", name, "' not found in file '", path, + "'."); } } } else { @@ -538,8 +628,6 @@ loadNVDB(const std::string& path, return fromNVDB(sourceHandles, device); } - - } // namespace io } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/io/SaveNanoVDB.cpp b/fvdb/src/detail/io/SaveNanoVDB.cpp index 9d9eb41aec..0d5c5f8ca9 100644 --- a/fvdb/src/detail/io/SaveNanoVDB.cpp +++ b/fvdb/src/detail/io/SaveNanoVDB.cpp @@ -1,39 +1,42 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#include "detail/io/IO.h" +#include "IO.h" -#include "detail/utils/Utils.h" - -#include -#include - -#include +#include #include #include -#include #include +#include + +#include #include +#include +#include namespace fvdb { namespace detail { namespace io { -/// @brief Copy a std::string to a char buffer with a fixed size and throw an exception if the string is too long +/// @brief Copy a 
std::string to a char buffer with a fixed size and throw an exception if the +/// string is too long /// @param targetBuf A pointer to the buffer to write the string to /// @param maxSize The maximum size of the target buffer /// @param sourceSting The source string to copy /// @param bufName A name for this string to use when throwing an exception (default is "String") -void setFixedSizeStringBuf(char* targetBuf, size_t maxSize, std::string sourceSting, std::string bufName = "String") { +void +setFixedSizeStringBuf(char *targetBuf, size_t maxSize, std::string sourceSting, + std::string bufName = "String") { memset(targetBuf, 0, maxSize); - TORCH_CHECK_VALUE(sourceSting.size() < maxSize, bufName + " exceeds maximum character length of " + std::to_string(maxSize) + "."); + TORCH_CHECK_VALUE(sourceSting.size() < maxSize, bufName + + " exceeds maximum character length of " + + std::to_string(maxSize) + "."); strncpy(targetBuf, sourceSting.c_str(), maxSize); } - /// @brief Get the (row) value at index rowIdx from a tensor with 2 dimensions. /// Specialized to return useful nanovdb types (e.g. Vec3f, Vec4f, etc...) /// @tparam ScalarT The scalar type of values @@ -42,56 +45,66 @@ void setFixedSizeStringBuf(char* targetBuf, size_t maxSize, std::string sourceSt /// @param rowIdx The row to read from /// @return The rowIdx^th row of the tensor casted to ValueT template -inline ValueT valueGetter(torch::TensorAccessor& acc, int rowIdx) { +inline ValueT +valueGetter(torch::TensorAccessor &acc, int rowIdx) { return acc[rowIdx][0]; } template <> -inline nanovdb::Vec3f valueGetter(torch::TensorAccessor& acc, int rowIdx) { - return {acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2]}; +inline nanovdb::Vec3f +valueGetter(torch::TensorAccessor &acc, int rowIdx) { + return { acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2] }; } template <> -inline nanovdb::Vec4f valueGetter(torch::TensorAccessor& acc, int rowIdx) { - return {acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2], acc[rowIdx][3]}; +inline nanovdb::Vec4f +valueGetter(torch::TensorAccessor &acc, int rowIdx) { + return { acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2], acc[rowIdx][3] }; } template <> -inline nanovdb::Vec3d valueGetter(torch::TensorAccessor& acc, int rowIdx) { - return {acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2]}; +inline nanovdb::Vec3d +valueGetter(torch::TensorAccessor &acc, int rowIdx) { + return { acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2] }; } template <> -inline nanovdb::Vec4d valueGetter(torch::TensorAccessor& acc, int rowIdx) { - return {acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2], acc[rowIdx][3]}; +inline nanovdb::Vec4d +valueGetter(torch::TensorAccessor &acc, int rowIdx) { + return { acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2], acc[rowIdx][3] }; } template <> -inline nanovdb::Vec3i valueGetter(torch::TensorAccessor& acc, int rowIdx) { - return {acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2]}; +inline nanovdb::Vec3i +valueGetter(torch::TensorAccessor &acc, int rowIdx) { + return { acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2] }; } template <> -inline nanovdb::math::Rgba8 valueGetter(torch::TensorAccessor& acc, int rowIdx) { - return {acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2], acc[rowIdx][3]}; +inline nanovdb::math::Rgba8 +valueGetter(torch::TensorAccessor &acc, int rowIdx) { + return { acc[rowIdx][0], acc[rowIdx][1], acc[rowIdx][2], acc[rowIdx][3] }; } - -/// @brief Helper function to copy an index grid with a corresponding JaggedTensor of values to a nanovdb grid -/// with values stored directly in the leaves. 
This will only work for values which correspond to valid nanovdb -/// grid types (e.g. Vec3f, Vec4f, Vec3d, Vec4d, etc...) -/// @tparam OutGridType The type of data to store in the returned grid (e.g. float, nanovdb::Vec3f, etc...) -/// @tparam InScalarType The scalar type of the input jagged tensor (e.g float, double, int32_t, etc...) +/// @brief Helper function to copy an index grid with a corresponding JaggedTensor of values to a +/// nanovdb grid +/// with values stored directly in the leaves. This will only work for values which +/// correspond to valid nanovdb grid types (e.g. Vec3f, Vec4f, Vec3d, Vec4d, etc...) +/// @tparam OutGridType The type of data to store in the returned grid (e.g. float, nanovdb::Vec3f, +/// etc...) +/// @tparam InScalarType The scalar type of the input jagged tensor (e.g float, double, int32_t, +/// etc...) /// @param gridBatch The batch of index grids to copy /// @param data The JaggedTensor of data to copy /// @param names The names of the grids in the batch to write to the copied output (optional) /// @return A nanovdb grid handle with the copied data stored in the leaves template -nanovdb::GridHandle fvdbToNanovdbGridWithValues(const GridBatch& gridBatch, - const JaggedTensor& data, - const std::vector& names) { - - TORCH_CHECK(names.size() == 0 || names.size() == (size_t) gridBatch.grid_count(), - "Invalid parameter for names, must be empty or a list of the same length as the batch size. Got " - + std::to_string(names.size()) + " names for batch size " + std::to_string(gridBatch.grid_count())); +nanovdb::GridHandle +fvdbToNanovdbGridWithValues(const GridBatch &gridBatch, const JaggedTensor &data, + const std::vector &names) { + TORCH_CHECK( + names.size() == 0 || names.size() == (size_t)gridBatch.grid_count(), + "Invalid parameter for names, must be empty or a list of the same length as the batch size. Got " + + std::to_string(names.size()) + " names for batch size " + + std::to_string(gridBatch.grid_count())); TORCH_CHECK(!gridBatch.is_mutable(), "Need to use indexing with mutable grids!"); - using ProxyGridT = nanovdb::tools::build::Grid; - using GridValueT = typename ProxyGridT::ValueType; + using ProxyGridT = nanovdb::tools::build::Grid; + using GridValueT = typename ProxyGridT::ValueType; using HostGridHandle = nanovdb::GridHandle; // We'll build each grid from the ijk values and data, so get accessors for these @@ -105,32 +118,39 @@ nanovdb::GridHandle fvdbToNanovdbGridWithValues(const GridB if (jdataCpu.ndimension() == 1) { jdataCpu = jdataCpu.unsqueeze(1); } - TORCH_CHECK(jdataCpu.size(0) == gridBatch.total_voxels(), "Invalid data tensor size. Must match number of voxels in grid batch."); + TORCH_CHECK(jdataCpu.size(0) == gridBatch.total_voxels(), + "Invalid data tensor size. Must match number of voxels in grid batch."); - auto ijkAccessor = ijkValues.jdata().accessor(); + auto ijkAccessor = ijkValues.jdata().accessor(); auto jdataAccessor = jdataCpu.accessor(); // Populate a vector of host buffers for each grid in the batch std::vector buffers(gridBatch.grid_count()); for (int64_t bi = 0; bi < gridBatch.grid_count(); bi += 1) { const std::string name = names.size() > 0 ? 
names[bi] : ""; - TORCH_CHECK_VALUE(name.size() < nanovdb::GridData::MaxNameSize, "Grid name " + name + " exceeds maximum character length of " + std::to_string(nanovdb::GridData::MaxNameSize) + "."); + TORCH_CHECK_VALUE(name.size() < nanovdb::GridData::MaxNameSize, + "Grid name " + name + " exceeds maximum character length of " + + std::to_string(nanovdb::GridData::MaxNameSize) + "."); - auto proxyGrid = std::make_shared(GridValueT(0), name); + auto proxyGrid = std::make_shared(GridValueT(0), name); auto proxyGridAccessor = proxyGrid->getWriteAccessor(); - const int start = ijkValues.joffsets()[bi].item(); - const int end = ijkValues.joffsets()[bi+1].item(); + const int start = ijkValues.joffsets()[bi].item(); + const int end = ijkValues.joffsets()[bi + 1].item(); const int64_t numVoxels = end - start; - const int64_t numData = data.joffsets()[bi+1].item() - data.joffsets()[bi].item(); + const int64_t numData = + data.joffsets()[bi + 1].item() - data.joffsets()[bi].item(); TORCH_CHECK_VALUE(numData == gridBatch.num_voxels_at(bi), - "Invalid number of voxels in jagged tensor at index " + std::to_string(bi) + - ". Expected it to match the number of voxels at grid index " + std::to_string(bi) + ". " + - "Got " + std::to_string(numVoxels) + " but expected " + - std::to_string(gridBatch.num_voxels_at(bi)) + "."); + "Invalid number of voxels in jagged tensor at index " + + std::to_string(bi) + + ". Expected it to match the number of voxels at grid index " + + std::to_string(bi) + ". " + "Got " + std::to_string(numVoxels) + + " but expected " + std::to_string(gridBatch.num_voxels_at(bi)) + "."); for (int i = 0; i < numVoxels; i += 1) { - const GridValueT& value = valueGetter(jdataAccessor, start + i); - const nanovdb::Coord ijk(ijkAccessor[start + i][0], ijkAccessor[start + i][1], ijkAccessor[start + i][2]); + const GridValueT &value = + valueGetter(jdataAccessor, start + i); + const nanovdb::Coord ijk(ijkAccessor[start + i][0], ijkAccessor[start + i][1], + ijkAccessor[start + i][2]); proxyGridAccessor.setValue(ijk, value); } proxyGridAccessor.merge(); @@ -138,22 +158,21 @@ nanovdb::GridHandle fvdbToNanovdbGridWithValues(const GridB // Write shape of tensor to blind data so we can load it back with the same shape // This lets us handle things like (N, 1, 3) tensors which we can save as Vec3f grids nanovdb::tools::CreateNanoGrid converter(*proxyGrid); - converter.addBlindData("fvdb_jdata", - nanovdb::GridBlindDataSemantic::Unknown, - nanovdb::GridBlindDataClass::Unknown, - nanovdb::GridType::Unknown, - data.jdata().dim() + 1, - sizeof(int64_t)); - buffers.push_back(converter.template getHandle(nanovdb::HostBuffer())); + converter.addBlindData("fvdb_jdata", nanovdb::GridBlindDataSemantic::Unknown, + nanovdb::GridBlindDataClass::Unknown, nanovdb::GridType::Unknown, + data.jdata().dim() + 1, sizeof(int64_t)); + buffers.push_back( + converter.template getHandle(nanovdb::HostBuffer())); TORCH_CHECK(buffers.back().gridCount() == 1, "Internal error. Invalid grid count."); - nanovdb::NanoGrid* nanoGrid = buffers.back().grid(); - TORCH_CHECK(nanoGrid->blindDataCount() == 1, "Internal error. Invalid blind metadata count."); - int64_t* writeHead = (int64_t*) nanoGrid->blindMetaData(0).blindData(); - JaggedTensor dataBi = data.index({bi}); - *writeHead = (int64_t) dataBi.jdata().dim(); + nanovdb::NanoGrid *nanoGrid = buffers.back().grid(); + TORCH_CHECK(nanoGrid->blindDataCount() == 1, + "Internal error. 
Invalid blind metadata count."); + int64_t *writeHead = (int64_t *)nanoGrid->blindMetaData(0).blindData(); + JaggedTensor dataBi = data.index({ bi }); + *writeHead = (int64_t)dataBi.jdata().dim(); writeHead += 1; for (int di = 0; di < dataBi.jdata().dim(); di += 1) { - *writeHead = (int64_t) dataBi.jdata().size(di); + *writeHead = (int64_t)dataBi.jdata().size(di); writeHead += 1; } } @@ -166,11 +185,11 @@ nanovdb::GridHandle fvdbToNanovdbGridWithValues(const GridB } } -nanovdb::GridHandle maybeConvertToStandardNanovdbGrid(const fvdb::GridBatch& gridBatch, - const fvdb::JaggedTensor data, - const std::vector names) -{ - // We can't convert mutable grids to a standard format because we don't know what do with disabled voxels +nanovdb::GridHandle +maybeConvertToStandardNanovdbGrid(const fvdb::GridBatch &gridBatch, const fvdb::JaggedTensor data, + const std::vector names) { + // We can't convert mutable grids to a standard format because we don't know what to do with + // disabled voxels if (gridBatch.is_mutable()) { return nanovdb::GridHandle(); } @@ -178,9 +197,11 @@ nanovdb::GridHandle maybeConvertToStandardNanovdbGrid(const // Get a squeezed view of the tensor so we can save data with redundant dimensions // (e.g. shape (N, 1, 3) can get saved as a Vec3f grid) torch::Tensor jdataSqueezed = data.jdata().squeeze(); - if (jdataSqueezed.numel() == 1 && jdataSqueezed.dim() == 0) { // Make sure we have at least 1 dimension + if (jdataSqueezed.numel() == 1 && + jdataSqueezed.dim() == 0) { // Make sure we have at least 1 dimension jdataSqueezed = jdataSqueezed.unsqueeze(0); - TORCH_CHECK(jdataSqueezed.ndimension() == 1, "Internal error: Invalid jdata shape when saving grid."); + TORCH_CHECK(jdataSqueezed.ndimension() == 1, + "Internal error: Invalid jdata shape when saving grid."); } if (data.dtype() == torch::kHalf) { if (jdataSqueezed.dim() == 1 || (jdataSqueezed.dim() == 2 && jdataSqueezed.size(1) == 1)) { @@ -232,16 +253,13 @@ nanovdb::GridHandle maybeConvertToStandardNanovdbGrid(const return nanovdb::GridHandle(); } -bool maybeSaveStandardNanovdbGrid(const std::string& path, - const GridBatch& gridBatch, - const JaggedTensor data, - const std::vector names, - nanovdb::io::Codec codec, - bool verbose) { - - nanovdb::GridHandle gridHandle = maybeConvertToStandardNanovdbGrid(gridBatch, data, names); - if (gridHandle.isEmpty()) - { +bool +maybeSaveStandardNanovdbGrid(const std::string &path, const GridBatch &gridBatch, + const JaggedTensor data, const std::vector names, + nanovdb::io::Codec codec, bool verbose) { + nanovdb::GridHandle gridHandle = + maybeConvertToStandardNanovdbGrid(gridBatch, data, names); + if (gridHandle.isEmpty()) { return false; } @@ -249,48 +267,54 @@ bool maybeSaveStandardNanovdbGrid(const std::string& path, return true; } -nanovdb::GridHandle getIndexGrid(const GridBatch& gridBatch, - const std::vector names = {}) { - - const nanovdb::GridHandle& nanoGridHdl = gridBatch.nanovdb_grid_handle(); +nanovdb::GridHandle +getIndexGrid(const GridBatch &gridBatch, const std::vector names = {}) { + const nanovdb::GridHandle &nanoGridHdl = gridBatch.nanovdb_grid_handle(); // Allocate memory and get pointer to host grid buffer nanovdb::HostBuffer writeBuf(nanoGridHdl.buffer().size()); - void* writeHead = writeBuf.data(); + void *writeHead = writeBuf.data(); // Get pointer to grid read from (possibly on the device) - const bool isCuda = nanoGridHdl.buffer().device().is_cuda(); - void* readHead = isCuda ?
nanoGridHdl.buffer().deviceData() : nanoGridHdl.buffer().data(); + const bool isCuda = nanoGridHdl.buffer().device().is_cuda(); + void *readHead = isCuda ? nanoGridHdl.buffer().deviceData() : nanoGridHdl.buffer().data(); const size_t sourceGridByteSize = nanoGridHdl.buffer().size(); // Write out the full grid to the buffer if (isCuda) { - at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(gridBatch.device().index()); - cudaMemcpyAsync(writeHead, readHead, sourceGridByteSize, cudaMemcpyDeviceToHost, defaultStream.stream()); + at::cuda::CUDAStream defaultStream = + at::cuda::getCurrentCUDAStream(gridBatch.device().index()); + cudaMemcpyAsync(writeHead, readHead, sourceGridByteSize, cudaMemcpyDeviceToHost, + defaultStream.stream()); cudaStreamSynchronize(defaultStream.stream()); } else { memcpy(writeHead, readHead, sourceGridByteSize); } - nanovdb::GridHandle retHandle = nanovdb::GridHandle (std::move(writeBuf)); + nanovdb::GridHandle retHandle = + nanovdb::GridHandle(std::move(writeBuf)); // Write voxelSize and origin information to the output buffer - for (int64_t bi = 0; bi < gridBatch.grid_count(); bi += 1) - { + for (int64_t bi = 0; bi < gridBatch.grid_count(); bi += 1) { nanovdb::GridData *retGridData = (nanovdb::GridData *)(retHandle.gridData(bi)); - torch::Tensor voxelSize = gridBatch.voxel_size_at(bi, torch::kFloat64); - torch::Tensor origin = gridBatch.origin_at(bi, torch::kFloat64); - retGridData->mVoxelSize = {voxelSize[0].item(), voxelSize[1].item(), voxelSize[2].item()}; - retGridData->mMap = nanovdb::Map(voxelSize[0].item(), {origin[0].item(), origin[1].item(), origin[2].item()}); + torch::Tensor voxelSize = gridBatch.voxel_size_at(bi, torch::kFloat64); + torch::Tensor origin = gridBatch.origin_at(bi, torch::kFloat64); + retGridData->mVoxelSize = { voxelSize[0].item(), voxelSize[1].item(), + voxelSize[2].item() }; + retGridData->mMap = nanovdb::Map( + voxelSize[0].item(), + { origin[0].item(), origin[1].item(), origin[2].item() }); } // If you passed in grid names, write them to the output buffer if (names.size() > 0) { for (int64_t bi = 0; bi < gridBatch.grid_count(); bi += 1) { const std::string name = names.size() > 0 ? 
names[bi] : ""; - TORCH_CHECK_VALUE(name.size() < nanovdb::GridData::MaxNameSize, "Grid name " + name + " exceeds maximum character length of " + std::to_string(nanovdb::GridData::MaxNameSize) + "."); - nanovdb::GridData* retGridData = (nanovdb::GridData*) (retHandle.gridData(bi)); - #pragma GCC diagnostic ignored "-Wstringop-truncation" + TORCH_CHECK_VALUE(name.size() < nanovdb::GridData::MaxNameSize, + "Grid name " + name + " exceeds maximum character length of " + + std::to_string(nanovdb::GridData::MaxNameSize) + "."); + nanovdb::GridData *retGridData = (nanovdb::GridData *)(retHandle.gridData(bi)); +#pragma GCC diagnostic ignored "-Wstringop-truncation" strncpy(retGridData->mGridName, names[bi].c_str(), nanovdb::GridData::MaxNameSize); } } @@ -299,12 +323,9 @@ nanovdb::GridHandle getIndexGrid(const GridBatch& gridBatch return retHandle; } -void saveIndexGrid(const std::string& path, - const GridBatch& gridBatch, - const std::vector names, - nanovdb::io::Codec codec, - bool verbose) { - +void +saveIndexGrid(const std::string &path, const GridBatch &gridBatch, + const std::vector names, nanovdb::io::Codec codec, bool verbose) { // If you don't pass in data, then we just write the grid nanovdb::GridHandle writeHandle = getIndexGrid(gridBatch, names); @@ -312,32 +333,31 @@ void saveIndexGrid(const std::string& path, nanovdb::io::writeGrid(path, writeHandle, codec, verbose); } -void saveIndexGridWithBlindData(const std::string& path, - const GridBatch& gridBatch, - const JaggedTensor data, - const std::vector names, - nanovdb::io::Codec codec, - bool verbose) { - - const nanovdb::GridHandle& nanoGridHdl = gridBatch.nanovdb_grid_handle(); +void +saveIndexGridWithBlindData(const std::string &path, const GridBatch &gridBatch, + const JaggedTensor data, const std::vector names, + nanovdb::io::Codec codec, bool verbose) { + const nanovdb::GridHandle &nanoGridHdl = gridBatch.nanovdb_grid_handle(); // Make a (possible) cpu copy of the data jagged tensor JaggedTensor cpuData = data.cpu().contiguous(); // Compute blind data sizes padded to be 32 byte aligned - std::vector blindDataPadding; // Size of each blind data padded to 32 bytes - std::vector paddedBlindDataSizes; // The amount of padding added to each blind data to achieve 32 byte alignment + std::vector blindDataPadding; // Size of each blind data padded to 32 bytes + std::vector paddedBlindDataSizes; // The amount of padding added to each blind data to + // achieve 32 byte alignment uint64_t totalBlindDataSize = 0; for (int bi = 0; bi < gridBatch.grid_count(); bi += 1) { - JaggedTensor dataBi = cpuData.index({bi}); - const int64_t numVoxelsBi = gridBatch.num_voxels_at(bi); + JaggedTensor dataBi = cpuData.index({ bi }); + const int64_t numVoxelsBi = gridBatch.num_voxels_at(bi); const int64_t jdataBytesBi = dataBi.jdata().numel() * dataBi.jdata().element_size(); - TORCH_CHECK_VALUE(numVoxelsBi == dataBi.rsize(0), - "Invalid number of voxels in jagged tensor at index " + std::to_string(bi) + - ". Expected it to match the number of voxels at grid index " + std::to_string(bi) + ". " + - "Got " + std::to_string(dataBi.jdata().size(0)) + " but expected " + - std::to_string(gridBatch.num_voxels_at(bi)) + "."); - const uint64_t blindDataSizeBi = jdataBytesBi + sizeof(int64_t) * (dataBi.rdim() + 1); + TORCH_CHECK_VALUE( + numVoxelsBi == dataBi.rsize(0), + "Invalid number of voxels in jagged tensor at index " + std::to_string(bi) + + ". Expected it to match the number of voxels at grid index " + std::to_string(bi) + + ". 
" + "Got " + std::to_string(dataBi.jdata().size(0)) + " but expected " + + std::to_string(gridBatch.num_voxels_at(bi)) + "."); + const uint64_t blindDataSizeBi = jdataBytesBi + sizeof(int64_t) * (dataBi.rdim() + 1); const uint64_t paddedBlindDataSizeBi = nanovdb::math::AlignUp<32UL>(blindDataSizeBi); blindDataPadding.push_back(paddedBlindDataSizeBi - blindDataSizeBi); paddedBlindDataSizes.push_back(paddedBlindDataSizeBi); @@ -345,61 +365,71 @@ void saveIndexGridWithBlindData(const std::string& path, } // Allocate a big enough buffer to allocate the index grid and blind data - const size_t allocSize = nanoGridHdl.buffer().size() + // Grids (32B aligned) - sizeof(nanovdb::GridBlindMetaData) * gridBatch.grid_count() + // Blind metadata (32B aligned) - totalBlindDataSize; // Blind data (32B aligned) + const size_t allocSize = nanoGridHdl.buffer().size() + // Grids (32B aligned) + sizeof(nanovdb::GridBlindMetaData) * + gridBatch.grid_count() + // Blind metadata (32B aligned) + totalBlindDataSize; // Blind data (32B aligned) nanovdb::HostBuffer writeBuf(allocSize); // Get pointer to read (possibly on the device) and write pointers - const bool isCuda = nanoGridHdl.buffer().device().is_cuda(); - uint8_t* writeHead = static_cast(writeBuf.data()); - uint8_t* readHead = static_cast(isCuda ? nanoGridHdl.buffer().deviceData() : nanoGridHdl.buffer().data()); + const bool isCuda = nanoGridHdl.buffer().device().is_cuda(); + uint8_t *writeHead = static_cast(writeBuf.data()); + uint8_t *readHead = static_cast(isCuda ? nanoGridHdl.buffer().deviceData() + : nanoGridHdl.buffer().data()); // Copy each grid and each entry in the jagged tensor for (int bi = 0; bi < gridBatch.grid_count(); bi += 1) { - // Copy the full bi^th index grid to the buffer const size_t sourceGridByteSize = nanoGridHdl.gridSize(bi); if (isCuda) { - at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(gridBatch.device().index()); - cudaMemcpyAsync((void*) writeHead, (void*) readHead, sourceGridByteSize, cudaMemcpyDeviceToHost, defaultStream.stream()); + at::cuda::CUDAStream defaultStream = + at::cuda::getCurrentCUDAStream(gridBatch.device().index()); + cudaMemcpyAsync((void *)writeHead, (void *)readHead, sourceGridByteSize, + cudaMemcpyDeviceToHost, defaultStream.stream()); } else { - memcpy((void*) writeHead, (void*) readHead, sourceGridByteSize); + memcpy((void *)writeHead, (void *)readHead, sourceGridByteSize); } // Update the metadata for the copied grid in the buffer to be a tensor grid with blind data - nanovdb::GridData* writeGridData = reinterpret_cast(writeHead); - writeGridData->mGridClass = nanovdb::GridClass::TensorGrid; - writeGridData->mGridType = gridBatch.is_mutable() ? nanovdb::GridType::OnIndexMask : nanovdb::GridType::OnIndex; - writeGridData->mBlindMetadataCount = 1; + nanovdb::GridData *writeGridData = reinterpret_cast(writeHead); + writeGridData->mGridClass = nanovdb::GridClass::TensorGrid; + writeGridData->mGridType = + gridBatch.is_mutable() ? nanovdb::GridType::OnIndexMask : nanovdb::GridType::OnIndex; + writeGridData->mBlindMetadataCount = 1; writeGridData->mBlindMetadataOffset = sourceGridByteSize; - const std::string name = names.size() > 0 ? names[bi] : ""; - setFixedSizeStringBuf(writeGridData->mGridName, nanovdb::GridData::MaxNameSize, name, "Grid name " + name); - writeGridData->mGridSize = sourceGridByteSize + sizeof(nanovdb::GridBlindMetaData) + paddedBlindDataSizes[bi]; + const std::string name = names.size() > 0 ? 
names[bi] : ""; + setFixedSizeStringBuf(writeGridData->mGridName, nanovdb::GridData::MaxNameSize, name, + "Grid name " + name); + writeGridData->mGridSize = + sourceGridByteSize + sizeof(nanovdb::GridBlindMetaData) + paddedBlindDataSizes[bi]; readHead += sourceGridByteSize; writeHead += sourceGridByteSize; // Write out blind metadata to the end of the grid - nanovdb::GridBlindMetaData* blindMetadata = reinterpret_cast(writeHead); + nanovdb::GridBlindMetaData *blindMetadata = + reinterpret_cast(writeHead); blindMetadata->mDataOffset = int64_t(sizeof(nanovdb::GridBlindMetaData)); blindMetadata->mValueCount = paddedBlindDataSizes[bi]; // Number of bytes - blindMetadata->mValueSize = 1; // 1 byte per value - blindMetadata->mSemantic = nanovdb::GridBlindDataSemantic::Unknown; - blindMetadata->mDataClass = nanovdb::GridBlindDataClass::Unknown; - blindMetadata->mDataType = nanovdb::GridType::Unknown; - const std::string fvdbBlindName = "fvdb_jdata" + TorchScalarTypeToStr(cpuData.scalar_type()); - setFixedSizeStringBuf(blindMetadata->mName, nanovdb::GridBlindMetaData::MaxNameSize, fvdbBlindName, "blind metadata name"); + blindMetadata->mValueSize = 1; // 1 byte per value + blindMetadata->mSemantic = nanovdb::GridBlindDataSemantic::Unknown; + blindMetadata->mDataClass = nanovdb::GridBlindDataClass::Unknown; + blindMetadata->mDataType = nanovdb::GridType::Unknown; + const std::string fvdbBlindName = + "fvdb_jdata" + TorchScalarTypeToStr(cpuData.scalar_type()); + setFixedSizeStringBuf(blindMetadata->mName, nanovdb::GridBlindMetaData::MaxNameSize, + fvdbBlindName, "blind metadata name"); TORCH_CHECK(blindMetadata->isValid(), "Invalid blind metadata"); writeHead += sizeof(nanovdb::GridBlindMetaData); // i^th jdata entry in the jagged tensor - JaggedTensor dataBi = cpuData.index({bi}); + JaggedTensor dataBi = cpuData.index({ bi }); TORCH_CHECK(dataBi.is_contiguous(), "Jagged tensor must be contiguous"); - // Write the shape of bi^th jdata tensor so we can load it with the same shape it was saved with - *reinterpret_cast(writeHead) = (int64_t) dataBi.rdim(); + // Write the shape of bi^th jdata tensor so we can load it with the same shape it was saved + // with + *reinterpret_cast(writeHead) = (int64_t)dataBi.rdim(); writeHead += sizeof(int64_t); for (int di = 0; di < dataBi.rdim(); di += 1) { - *reinterpret_cast(writeHead) = (int64_t) dataBi.rsize(di); + *reinterpret_cast(writeHead) = (int64_t)dataBi.rsize(di); writeHead += sizeof(int64_t); } @@ -407,14 +437,15 @@ void saveIndexGridWithBlindData(const std::string& path, const int64_t jdataSize = dataBi.jdata().numel() * dataBi.jdata().element_size(); TORCH_CHECK(dataBi.jdata().is_contiguous(), "Jagged tensor must be contiguous"); TORCH_CHECK(dataBi.device().is_cpu(), "Jagged tensor must be on CPU"); - memcpy((void*) writeHead, (void*) dataBi.jdata().data_ptr(), jdataSize); + memcpy((void *)writeHead, (void *)dataBi.jdata().data_ptr(), jdataSize); writeHead += jdataSize; - writeHead += blindDataPadding[bi]; // Add padding to make sure we're 32 byte aligned + writeHead += blindDataPadding[bi]; // Add padding to make sure we're 32 byte aligned } // Synchronize cuda stream if we just did a bunch of GPU -> CPU transfers if (isCuda) { - at::cuda::CUDAStream defaultStream = at::cuda::getCurrentCUDAStream(gridBatch.device().index()); + at::cuda::CUDAStream defaultStream = + at::cuda::getCurrentCUDAStream(gridBatch.device().index()); cudaStreamSynchronize(defaultStream.stream()); } @@ -424,36 +455,30 @@ void saveIndexGridWithBlindData(const std::string& path, } 
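// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of the patch): the "fvdb_jdata" blind
// data written by saveIndexGridWithBlindData above is a shape header of
// (ndim + 1) int64 values followed by the raw tensor bytes, with the whole
// payload padded up to a 32-byte boundary; the load path in LoadNanovdb.cpp
// parses it back in the same order. The helper names below (alignUp32,
// shapeHeaderBytes, writeBlindPayload) are hypothetical and exist only for
// this example; dst is assumed to be at least 8-byte aligned, as the 32-byte
// aligned nanovdb blind-data region guarantees.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstring>
#include <vector>

namespace fvdb_blind_example {

// Round n up to the next multiple of 32 bytes, mirroring nanovdb::math::AlignUp<32UL>.
inline uint64_t alignUp32(uint64_t n) { return (n + 31u) & ~uint64_t(31); }

// Bytes used by the shape header: one int64 for ndim plus one int64 per dimension.
inline uint64_t shapeHeaderBytes(const std::vector<int64_t> &shape) {
    return sizeof(int64_t) * (shape.size() + 1);
}

// Write [ndim][dim_0 ... dim_{ndim-1}][raw tensor bytes] into dst, which must
// hold at least alignUp32(shapeHeaderBytes(shape) + dataBytes) bytes.
inline void writeBlindPayload(uint8_t *dst, const std::vector<int64_t> &shape,
                              const void *data, uint64_t dataBytes) {
    int64_t *head = reinterpret_cast<int64_t *>(dst);
    *head++ = static_cast<int64_t>(shape.size()); // ndim
    for (int64_t d : shape) {
        *head++ = d;                              // each dimension
    }
    std::memcpy(head, data, dataBytes);           // raw jdata bytes follow the header
}

} // namespace fvdb_blind_example
// ---------------------------------------------------------------------------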
nanovdb::GridHandle -toNVDB(const GridBatch& gridBatch, - const torch::optional maybeData, +toNVDB(const GridBatch &gridBatch, const torch::optional maybeData, const torch::optional maybeNames) { - // Get optional names std::vector names; - if (maybeNames.has_value()) - { + if (maybeNames.has_value()) { names = maybeNames.value().value(); - TORCH_CHECK_VALUE(names.size() == 0 || names.size() == (size_t)gridBatch.grid_count(), - "Invalid parameter for names, must be empty or a list of the same length as the batch size. Got " + std::to_string(names.size()) + " names for batch size " + std::to_string(gridBatch.grid_count())); + TORCH_CHECK_VALUE( + names.size() == 0 || names.size() == (size_t)gridBatch.grid_count(), + "Invalid parameter for names, must be empty or a list of the same length as the batch size. Got " + + std::to_string(names.size()) + " names for batch size " + + std::to_string(gridBatch.grid_count())); } - if (maybeData.has_value()) - { + if (maybeData.has_value()) { return maybeConvertToStandardNanovdbGrid(gridBatch, maybeData.value(), names); - } - else - { + } else { return getIndexGrid(gridBatch, names); } } -void saveNVDB(const std::string& path, - const GridBatch& gridBatch, - const torch::optional maybeData, - const torch::optional maybeNames, - bool compressed, - bool verbose) { - +void +saveNVDB(const std::string &path, const GridBatch &gridBatch, + const torch::optional maybeData, + const torch::optional maybeNames, bool compressed, bool verbose) { // Which Codec to use for saving nanovdb::io::Codec codec = compressed ? nanovdb::io::Codec::BLOSC : nanovdb::io::Codec::NONE; @@ -461,9 +486,11 @@ void saveNVDB(const std::string& path, std::vector names; if (maybeNames.has_value()) { names = maybeNames.value().value(); - TORCH_CHECK_VALUE(names.size() == 0 || names.size() == (size_t) gridBatch.grid_count(), - "Invalid parameter for names, must be empty or a list of the same length as the batch size. Got " - + std::to_string(names.size()) + " names for batch size " + std::to_string(gridBatch.grid_count())); + TORCH_CHECK_VALUE( + names.size() == 0 || names.size() == (size_t)gridBatch.grid_count(), + "Invalid parameter for names, must be empty or a list of the same length as the batch size. Got " + + std::to_string(names.size()) + " names for batch size " + + std::to_string(gridBatch.grid_count())); } JaggedTensor data; @@ -475,20 +502,23 @@ void saveNVDB(const std::string& path, } TORCH_CHECK_VALUE(data.jdata().ndimension() >= 1, "Invalid jagged data shape in save_nvdb"); - TORCH_CHECK_VALUE(gridBatch.total_voxels() == data.jdata().size(0), "Invalid jagged data shape in save_nvdb. Must match number of voxels"); - TORCH_CHECK_VALUE(gridBatch.device() == data.device(), "Device should match between grid batch and data"); - - // Heuristically determine if we can use a standard nanovdb grid (e.g. vec3f, float, vec3i, etc...) to store the data - // If so, we save such a grid -- otherwise we save an index grid with custom blind data + TORCH_CHECK_VALUE(gridBatch.total_voxels() == data.jdata().size(0), + "Invalid jagged data shape in save_nvdb. Must match number of voxels"); + TORCH_CHECK_VALUE(gridBatch.device() == data.device(), + "Device should match between grid batch and data"); + + // Heuristically determine if we can use a standard nanovdb grid (e.g. vec3f, float, vec3i, + // etc...) 
to store the data If so, we save such a grid -- otherwise we save an index grid with + // custom blind data if (maybeSaveStandardNanovdbGrid(path, gridBatch, data, names, codec, verbose)) { return; } else { - // If we didn't manage to save a standard nanovdb grid, just save a tensor grid with blind data + // If we didn't manage to save a standard nanovdb grid, just save a tensor grid with blind + // data saveIndexGridWithBlindData(path, gridBatch, data, names, codec, verbose); } } - } // namespace io } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/ops/ActiveGridGoords.cu b/fvdb/src/detail/ops/ActiveGridGoords.cu index 598a2525d0..bf02e49593 100644 --- a/fvdb/src/detail/ops/ActiveGridGoords.cu +++ b/fvdb/src/detail/ops/ActiveGridGoords.cu @@ -1,117 +1,131 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#include -#include +#include -#include "detail/utils/cuda/Utils.cuh" +#include +#include namespace fvdb { namespace detail { namespace ops { - /// @brief Per-voxel callback for getting the enabled grid coordinates in a batch of grids template typename TorchAccessor> -__hostdev__ inline void enabledGridCoordsVoxelCallback(int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, - GridBatchImpl::Accessor gridAccessor, - TorchAccessor leafBaseOffset, - TorchAccessor outGridCoords) { - const nanovdb::NanoGrid* grid = gridAccessor.grid(batchIdx); - const typename nanovdb::NanoGrid::LeafNodeType& leaf = grid->tree().template getFirstNode<0>()[leafIdx]; - const nanovdb::Coord& ijk = leaf.offsetToGlobalCoord(voxelIdx); - const int64_t outIdx = leafBaseOffset[leafIdx] + leaf.template get>(voxelIdx); +__hostdev__ inline void +enabledGridCoordsVoxelCallback(int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, + GridBatchImpl::Accessor gridAccessor, + TorchAccessor leafBaseOffset, + TorchAccessor outGridCoords) { + const nanovdb::NanoGrid *grid = gridAccessor.grid(batchIdx); + const typename nanovdb::NanoGrid::LeafNodeType &leaf = + grid->tree().template getFirstNode<0>()[leafIdx]; + const nanovdb::Coord &ijk = leaf.offsetToGlobalCoord(voxelIdx); + const int64_t outIdx = + leafBaseOffset[leafIdx] + leaf.template get>(voxelIdx); if (leaf.template get>(voxelIdx)) { outGridCoords[outIdx][0] = ijk[0]; outGridCoords[outIdx][1] = ijk[1]; outGridCoords[outIdx][2] = ijk[2]; } - } - /// @brief Per-voxel callback which computes the active grid coordinates for a batch of grids template typename TorchAccessor> -__hostdev__ inline void activeGridCoordsVoxelCallback(int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, - GridBatchImpl::Accessor gridAccessor, - TorchAccessor outGridCoords) { - - const nanovdb::NanoGrid* grid = gridAccessor.grid(batchIdx); - const typename nanovdb::NanoGrid::LeafNodeType& leaf = grid->tree().template getFirstNode<0>()[leafIdx]; +__hostdev__ inline void +activeGridCoordsVoxelCallback(int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, + GridBatchImpl::Accessor gridAccessor, + TorchAccessor outGridCoords) { + const nanovdb::NanoGrid *grid = gridAccessor.grid(batchIdx); + const typename nanovdb::NanoGrid::LeafNodeType &leaf = + grid->tree().template getFirstNode<0>()[leafIdx]; const int64_t baseOffset = gridAccessor.voxelOffset(batchIdx); - - const nanovdb::Coord& ijk = leaf.offsetToGlobalCoord(voxelIdx); + const nanovdb::Coord &ijk = leaf.offsetToGlobalCoord(voxelIdx); if (leaf.isActive(voxelIdx)) { - const int64_t idx = baseOffset + (int64_t) leaf.getValue(voxelIdx) - 1; + const int64_t idx = baseOffset + 
(int64_t)leaf.getValue(voxelIdx) - 1; outGridCoords[idx][0] = ijk[0]; outGridCoords[idx][1] = ijk[1]; outGridCoords[idx][2] = ijk[2]; } } - /// @brief Get the enabled grid coordinates for a batch of grids (ignoring disabled voxels) /// @param gridBatch The batch of grids (must be mutable) /// @param outGridCoords Tensor which will contain the output grid coordinates template -void GetEnabledGridCoords(const GridBatchImpl& gridBatch, torch::Tensor& outGridCoords) { +void +GetEnabledGridCoords(const GridBatchImpl &gridBatch, torch::Tensor &outGridCoords) { using GridType = nanovdb::ValueOnIndexMask; // Compute a prefix sum of the unmasked voxels per leaf - const torch::Tensor leafBaseOffset = countEnabledPerLeafShiftedByOne(gridBatch).cumsum(0, torch::kInt64); + const torch::Tensor leafBaseOffset = + countEnabledPerLeafShiftedByOne(gridBatch).cumsum(0, torch::kInt64); // Get the unmasked grid coordinates auto leafBaseOffsetAcc = tensorAccessor(leafBaseOffset); - auto outCoordsAcc = tensorAccessor(outGridCoords); + auto outCoordsAcc = tensorAccessor(outGridCoords); if constexpr (DeviceTag == torch::kCUDA) { - auto cb = [=] __device__ (int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, int64_t, GridBatchImpl::Accessor gridAccessor) { - enabledGridCoordsVoxelCallback(batchIdx, leafIdx, voxelIdx, gridAccessor, leafBaseOffsetAcc, outCoordsAcc); + auto cb = [=] __device__(int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, int64_t, + GridBatchImpl::Accessor gridAccessor) { + enabledGridCoordsVoxelCallback( + batchIdx, leafIdx, voxelIdx, gridAccessor, leafBaseOffsetAcc, outCoordsAcc); }; forEachVoxelCUDA(1024, 1, gridBatch, cb); } else { - auto cb = [=] (int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, int64_t, GridBatchImpl::Accessor gridAccessor) { - enabledGridCoordsVoxelCallback(batchIdx, leafIdx, voxelIdx, gridAccessor, leafBaseOffsetAcc, outCoordsAcc); + auto cb = [=](int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, int64_t, + GridBatchImpl::Accessor gridAccessor) { + enabledGridCoordsVoxelCallback( + batchIdx, leafIdx, voxelIdx, gridAccessor, leafBaseOffsetAcc, outCoordsAcc); }; forEachVoxelCPU(1, gridBatch, cb); } } - -/// @brief Get the active grid coordinates for a batch of grids (including disabled coordinates in mutable grids) +/// @brief Get the active grid coordinates for a batch of grids (including disabled coordinates in +/// mutable grids) /// @tparam GridType The type of the grid (one of ValueOnIndex, ValueOnIndexMask) /// @param gridBatch The batch of grids /// @param outGridCoords Tensor which will contain the output grid coordinates template -void GetActiveGridCoords(const GridBatchImpl& gridBatch, torch::Tensor& outGridCoords) { +void +GetActiveGridCoords(const GridBatchImpl &gridBatch, torch::Tensor &outGridCoords) { auto outCoordsAcc = tensorAccessor(outGridCoords); if constexpr (DeviceTag == torch::kCUDA) { - auto cb = [=] __device__ (int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, int64_t, GridBatchImpl::Accessor gridAccessor) { - activeGridCoordsVoxelCallback(batchIdx, leafIdx, voxelIdx, gridAccessor, outCoordsAcc); + auto cb = [=] __device__(int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, int64_t, + GridBatchImpl::Accessor gridAccessor) { + activeGridCoordsVoxelCallback(batchIdx, leafIdx, voxelIdx, + gridAccessor, outCoordsAcc); }; forEachVoxelCUDA(1024, 1, gridBatch, cb); } else { - auto cb = [=] (int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, int64_t, GridBatchImpl::Accessor gridAccessor) { - activeGridCoordsVoxelCallback(batchIdx, leafIdx, 
voxelIdx, gridAccessor, outCoordsAcc); + auto cb = [=](int64_t batchIdx, int64_t leafIdx, int64_t voxelIdx, int64_t, + GridBatchImpl::Accessor gridAccessor) { + activeGridCoordsVoxelCallback(batchIdx, leafIdx, voxelIdx, + gridAccessor, outCoordsAcc); }; forEachVoxelCPU(1, gridBatch, cb); } } - -/// @brief Get the number of active (or enabled for mutable grids) ijk coordiantes in a batch of grids +/// @brief Get the number of active (or enabled for mutable grids) ijk coordiantes in a batch of +/// grids /// @tparam DeviceTag Which device to run on /// @param gridBatch The batch of grids to get the active coordinates for -/// @param ignoreDisabledVoxels If set to true, and the grid batch is mutable, also return coordinates that are disabled +/// @param ignoreDisabledVoxels If set to true, and the grid batch is mutable, also return +/// coordinates that are disabled /// @return A JaggedTensor or shape [B, -1, 3] of active/enabled IJK coordinates template -JaggedTensor ActiveGridCoords(const GridBatchImpl& gridBatch, bool ignoreDisabledVoxels) { +JaggedTensor +ActiveGridCoords(const GridBatchImpl &gridBatch, bool ignoreDisabledVoxels) { gridBatch.checkNonEmptyGrid(); - auto opts = torch::TensorOptions().dtype(torch::kInt32).device(gridBatch.device()); - torch::Tensor outGridCoords = torch::empty({gridBatch.totalEnabledVoxels(ignoreDisabledVoxels), 3}, opts); + auto opts = torch::TensorOptions().dtype(torch::kInt32).device(gridBatch.device()); + torch::Tensor outGridCoords = + torch::empty({ gridBatch.totalEnabledVoxels(ignoreDisabledVoxels), 3 }, opts); FVDB_DISPATCH_GRID_TYPES(gridBatch, [&]() { - if (ignoreDisabledVoxels || nanovdb::util::is_same::value) { + if (ignoreDisabledVoxels || + nanovdb::util::is_same::value) { GetActiveGridCoords(gridBatch, outGridCoords); } else if (nanovdb::util::is_same::value) { TORCH_CHECK(!ignoreDisabledVoxels, "This should never happen"); @@ -121,19 +135,18 @@ JaggedTensor ActiveGridCoords(const GridBatchImpl& gridBatch, bool ignoreDisable return gridBatch.jaggedTensor(outGridCoords, ignoreDisabledVoxels); } - - template <> -JaggedTensor dispatchActiveGridCoords(const GridBatchImpl& gridBatch, bool ignoreMasked) { +JaggedTensor +dispatchActiveGridCoords(const GridBatchImpl &gridBatch, bool ignoreMasked) { return ActiveGridCoords(gridBatch, ignoreMasked); } template <> -JaggedTensor dispatchActiveGridCoords(const GridBatchImpl& gridBatch, bool ignoreMasked) { +JaggedTensor +dispatchActiveGridCoords(const GridBatchImpl &gridBatch, bool ignoreMasked) { return ActiveGridCoords(gridBatch, ignoreMasked); } - } // namespace ops } // namespace detail } // namespace fvdb \ No newline at end of file diff --git a/fvdb/src/detail/ops/ActiveVoxelsInBoundsMask.cu b/fvdb/src/detail/ops/ActiveVoxelsInBoundsMask.cu index d6527d9395..8b646394de 100644 --- a/fvdb/src/detail/ops/ActiveVoxelsInBoundsMask.cu +++ b/fvdb/src/detail/ops/ActiveVoxelsInBoundsMask.cu @@ -1,133 +1,153 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#include +#include +#include -#include "detail/utils/cuda/Utils.cuh" -#include "detail/utils/nanovdb/CustomAccessors.h" +#include namespace fvdb { namespace detail { namespace ops { - -/// @brief Per-voxel callback to compute a mask of the enabled voxels in a bounding box for a batch of grids +/// @brief Per-voxel callback to compute a mask of the enabled voxels in a bounding box for a batch +/// of grids template typename TorchAccessor> -__hostdev__ inline void enabledGridVoxelInBoundsMaskCallback(int32_t batchIdx, 
int32_t leafIdx, int32_t voxelIdx, - GridBatchImpl::Accessor gridAccessor, - TorchAccessor leafBaseOffset, - TorchAccessor bboxes, - TorchAccessor outGridBoundsMask) { - const nanovdb::CoordBBox maskBbox(nanovdb::Coord(bboxes[batchIdx][0][0], bboxes[batchIdx][0][1], bboxes[batchIdx][0][2]), - nanovdb::Coord(bboxes[batchIdx][1][0], bboxes[batchIdx][1][1], bboxes[batchIdx][1][2])); - - const nanovdb::NanoGrid *grid = gridAccessor.grid(batchIdx); - const typename nanovdb::NanoGrid::LeafNodeType& leaf = grid->tree().template getFirstNode<0>()[leafIdx]; +__hostdev__ inline void +enabledGridVoxelInBoundsMaskCallback(int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, + GridBatchImpl::Accessor gridAccessor, + TorchAccessor leafBaseOffset, + TorchAccessor bboxes, + TorchAccessor outGridBoundsMask) { + const nanovdb::CoordBBox maskBbox( + nanovdb::Coord(bboxes[batchIdx][0][0], bboxes[batchIdx][0][1], bboxes[batchIdx][0][2]), + nanovdb::Coord(bboxes[batchIdx][1][0], bboxes[batchIdx][1][1], bboxes[batchIdx][1][2])); + + const nanovdb::NanoGrid *grid = gridAccessor.grid(batchIdx); + const typename nanovdb::NanoGrid::LeafNodeType &leaf = + grid->tree().template getFirstNode<0>()[leafIdx]; if (maskBbox.hasOverlap(leaf.bbox())) { const nanovdb::Coord ijk = leaf.offsetToGlobalCoord(voxelIdx); if (leaf.template get>(voxelIdx) && maskBbox.isInside(ijk)) { - const int64_t outIdx = leafBaseOffset[leafIdx] + leaf.template get>(voxelIdx); + const int64_t outIdx = + leafBaseOffset[leafIdx] + leaf.template get>(voxelIdx); outGridBoundsMask[outIdx] = true; } } } -/// @brief Per-voxel callback to compute a mask of the active grid voxels in a bounding box for a batch of grids +/// @brief Per-voxel callback to compute a mask of the active grid voxels in a bounding box for a +/// batch of grids template typename TorchAccessor> -__hostdev__ inline void activeGridVoxelInBoundsMaskCallback(int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, - GridBatchImpl::Accessor gridAccessor, - TorchAccessor bboxes, - TorchAccessor outGridBoundsMask) { - - const nanovdb::CoordBBox maskBbox(nanovdb::Coord(bboxes[batchIdx][0][0], bboxes[batchIdx][0][1], bboxes[batchIdx][0][2]), - nanovdb::Coord(bboxes[batchIdx][1][0], bboxes[batchIdx][1][1], bboxes[batchIdx][1][2])); - - const nanovdb::NanoGrid *grid = gridAccessor.grid(batchIdx); - const typename nanovdb::NanoGrid::LeafNodeType& leaf = grid->tree().template getFirstNode<0>()[leafIdx]; +__hostdev__ inline void +activeGridVoxelInBoundsMaskCallback(int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, + GridBatchImpl::Accessor gridAccessor, + TorchAccessor bboxes, + TorchAccessor outGridBoundsMask) { + const nanovdb::CoordBBox maskBbox( + nanovdb::Coord(bboxes[batchIdx][0][0], bboxes[batchIdx][0][1], bboxes[batchIdx][0][2]), + nanovdb::Coord(bboxes[batchIdx][1][0], bboxes[batchIdx][1][1], bboxes[batchIdx][1][2])); + + const nanovdb::NanoGrid *grid = gridAccessor.grid(batchIdx); + const typename nanovdb::NanoGrid::LeafNodeType &leaf = + grid->tree().template getFirstNode<0>()[leafIdx]; if (maskBbox.hasOverlap(leaf.bbox())) { const nanovdb::Coord ijk = leaf.offsetToGlobalCoord(voxelIdx); if (leaf.isActive(voxelIdx) && maskBbox.isInside(ijk)) { const int64_t baseOffset = gridAccessor.voxelOffset(batchIdx); - const int64_t idx = baseOffset + (int64_t)leaf.getValue(voxelIdx) - 1; - outGridBoundsMask[idx] = true; + const int64_t idx = baseOffset + (int64_t)leaf.getValue(voxelIdx) - 1; + outGridBoundsMask[idx] = true; } } } -/// @brief Get a boolean mask of the enabled grid voxels for a batch 
of grids (ignoring disabled voxels) +/// @brief Get a boolean mask of the enabled grid voxels for a batch of grids (ignoring disabled +/// voxels) /// @param gridBatch The batch of grids (must be mutable) /// @param batchBboxes The batch of bounding boxes /// @param outGridCoords Tensor which will contain the output grid coordinates template -void GetEnabledVoxelsInBoundsMask(const GridBatchImpl& gridBatch, - torch::Tensor& batchBboxes, - torch::Tensor& outGridBoundsMask) { +void +GetEnabledVoxelsInBoundsMask(const GridBatchImpl &gridBatch, torch::Tensor &batchBboxes, + torch::Tensor &outGridBoundsMask) { using GridType = nanovdb::ValueOnIndexMask; // Compute a prefix sum of the unmasked voxels per leaf - const torch::Tensor leafBaseOffset = countEnabledPerLeafShiftedByOne(gridBatch).cumsum(0, torch::kInt64); + const torch::Tensor leafBaseOffset = + countEnabledPerLeafShiftedByOne(gridBatch).cumsum(0, torch::kInt64); // Get the unmasked grid coordinates auto leafBaseOffsetAcc = tensorAccessor(leafBaseOffset); - auto outMaskAcc = tensorAccessor(outGridBoundsMask); - auto bboxAcc = tensorAccessor(batchBboxes); + auto outMaskAcc = tensorAccessor(outGridBoundsMask); + auto bboxAcc = tensorAccessor(batchBboxes); if constexpr (DeviceTag == torch::kCUDA) { - auto cb = [=] __device__(int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, int32_t, GridBatchImpl::Accessor gridAccessor) { - enabledGridVoxelInBoundsMaskCallback(batchIdx, leafIdx, voxelIdx, gridAccessor, leafBaseOffsetAcc, bboxAcc, outMaskAcc); + auto cb = [=] __device__(int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, int32_t, + GridBatchImpl::Accessor gridAccessor) { + enabledGridVoxelInBoundsMaskCallback( + batchIdx, leafIdx, voxelIdx, gridAccessor, leafBaseOffsetAcc, bboxAcc, outMaskAcc); }; forEachVoxelCUDA(1024, 1, gridBatch, cb); } else { - auto cb = [=](int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, int32_t, GridBatchImpl::Accessor gridAccessor) { - enabledGridVoxelInBoundsMaskCallback(batchIdx, leafIdx, voxelIdx, gridAccessor, leafBaseOffsetAcc, bboxAcc, outMaskAcc); + auto cb = [=](int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, int32_t, + GridBatchImpl::Accessor gridAccessor) { + enabledGridVoxelInBoundsMaskCallback( + batchIdx, leafIdx, voxelIdx, gridAccessor, leafBaseOffsetAcc, bboxAcc, outMaskAcc); }; forEachVoxelCPU(1, gridBatch, cb); } } -/// @brief Get a boolean mask of the active grid voxels for a batch of grids (including disabled coordinates in mutable grids) +/// @brief Get a boolean mask of the active grid voxels for a batch of grids (including disabled +/// coordinates in mutable grids) /// @tparam GridType The type of the grid (one of ValueOnIndex, ValueOnIndexMask) /// @param gridBatch The batch of grids /// @param batchBboxes The batch of bounding boxes /// @param outGridCoords Tensor which will contain the output grid coordinates template -void GetActiveVoxelsInBoundsMask(const GridBatchImpl& gridBatch, - torch::Tensor& batchBboxes, - torch::Tensor& outGridBoundsMask) { +void +GetActiveVoxelsInBoundsMask(const GridBatchImpl &gridBatch, torch::Tensor &batchBboxes, + torch::Tensor &outGridBoundsMask) { auto outMaskAcc = tensorAccessor(outGridBoundsMask); - auto bboxAcc = tensorAccessor(batchBboxes); + auto bboxAcc = tensorAccessor(batchBboxes); if constexpr (DeviceTag == torch::kCUDA) { - auto cb = [=] __device__(int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, int32_t, GridBatchImpl::Accessor gridAccessor) { - activeGridVoxelInBoundsMaskCallback(batchIdx, leafIdx, voxelIdx, gridAccessor, 
bboxAcc, outMaskAcc); + auto cb = [=] __device__(int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, int32_t, + GridBatchImpl::Accessor gridAccessor) { + activeGridVoxelInBoundsMaskCallback( + batchIdx, leafIdx, voxelIdx, gridAccessor, bboxAcc, outMaskAcc); }; forEachVoxelCUDA(1024, 1, gridBatch, cb); } else { - auto cb = [=](int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, int32_t, GridBatchImpl::Accessor gridAccessor) { - activeGridVoxelInBoundsMaskCallback(batchIdx, leafIdx, voxelIdx, gridAccessor, bboxAcc, outMaskAcc); + auto cb = [=](int32_t batchIdx, int32_t leafIdx, int32_t voxelIdx, int32_t, + GridBatchImpl::Accessor gridAccessor) { + activeGridVoxelInBoundsMaskCallback( + batchIdx, leafIdx, voxelIdx, gridAccessor, bboxAcc, outMaskAcc); }; forEachVoxelCPU(1, gridBatch, cb); } } template -JaggedTensor ActiveVoxelsInBoundsMask(const GridBatchImpl& batchHdl, - const Vec3iBatch& ijkMin, - const Vec3iBatch& ijkMax, - bool ignoreDisabledVoxels) { - +JaggedTensor +ActiveVoxelsInBoundsMask(const GridBatchImpl &batchHdl, const Vec3iBatch &ijkMin, + const Vec3iBatch &ijkMax, bool ignoreDisabledVoxels) { batchHdl.checkNonEmptyGrid(); // output storage - auto opts = torch::TensorOptions().dtype(torch::kBool).device(batchHdl.device()); - torch::Tensor outGridBoundsMask = torch::zeros({batchHdl.totalEnabledVoxels(ignoreDisabledVoxels)}, opts); + auto opts = torch::TensorOptions().dtype(torch::kBool).device(batchHdl.device()); + torch::Tensor outGridBoundsMask = + torch::zeros({ batchHdl.totalEnabledVoxels(ignoreDisabledVoxels) }, opts); // bbox to tensor storage - const std::vector& bboxMins = ijkMin.value(batchHdl.batchSize(), false, "ijk_min"); - const std::vector& bboxMaxs = ijkMax.value(batchHdl.batchSize(), false, "ijk_max"); + const std::vector &bboxMins = + ijkMin.value(batchHdl.batchSize(), false, "ijk_min"); + const std::vector &bboxMaxs = + ijkMax.value(batchHdl.batchSize(), false, "ijk_max"); - torch::Tensor batchBboxes = torch::empty({batchHdl.batchSize(), 2, 3}, - torch::TensorOptions().dtype(torch::kInt32).device(batchHdl.device())); + torch::Tensor batchBboxes = + torch::empty({ batchHdl.batchSize(), 2, 3 }, + torch::TensorOptions().dtype(torch::kInt32).device(batchHdl.device())); for (size_t batchIdx = 0; batchIdx < batchHdl.batchSize(); batchIdx++) { for (size_t dimIdx = 0; dimIdx < 3; dimIdx++) { @@ -138,8 +158,10 @@ JaggedTensor ActiveVoxelsInBoundsMask(const GridBatchImpl& batchHdl, // create boolean mask of active voxels FVDB_DISPATCH_GRID_TYPES(batchHdl, [&]() { - if (ignoreDisabledVoxels || nanovdb::util::is_same::value) { - GetActiveVoxelsInBoundsMask(batchHdl, batchBboxes, outGridBoundsMask); + if (ignoreDisabledVoxels || + nanovdb::util::is_same::value) { + GetActiveVoxelsInBoundsMask(batchHdl, batchBboxes, + outGridBoundsMask); } else if (nanovdb::util::is_same::value) { TORCH_CHECK(!ignoreDisabledVoxels, "This should never happen"); GetEnabledVoxelsInBoundsMask(batchHdl, batchBboxes, outGridBoundsMask); @@ -149,21 +171,24 @@ JaggedTensor ActiveVoxelsInBoundsMask(const GridBatchImpl& batchHdl, return batchHdl.jaggedTensor(outGridBoundsMask, ignoreDisabledVoxels); } - template <> -JaggedTensor dispatchActiveVoxelsInBoundsMask(const GridBatchImpl& batchHdl, - const Vec3iBatch& boundsMinIjk, - const Vec3iBatch& boundsMaxIjk, - bool ignoreDisabledVoxels) { - return ActiveVoxelsInBoundsMask(batchHdl, boundsMinIjk, boundsMaxIjk, ignoreDisabledVoxels); +JaggedTensor +dispatchActiveVoxelsInBoundsMask(const GridBatchImpl &batchHdl, + const Vec3iBatch &boundsMinIjk, + const 
Vec3iBatch &boundsMaxIjk, + bool ignoreDisabledVoxels) { + return ActiveVoxelsInBoundsMask(batchHdl, boundsMinIjk, boundsMaxIjk, + ignoreDisabledVoxels); } template <> -JaggedTensor dispatchActiveVoxelsInBoundsMask(const GridBatchImpl& batchHdl, - const Vec3iBatch& boundsMinIjk, - const Vec3iBatch& boundsMaxIjk, - bool ignoreDisabledVoxels) { - return ActiveVoxelsInBoundsMask(batchHdl, boundsMinIjk, boundsMaxIjk, ignoreDisabledVoxels); +JaggedTensor +dispatchActiveVoxelsInBoundsMask(const GridBatchImpl &batchHdl, + const Vec3iBatch &boundsMinIjk, + const Vec3iBatch &boundsMaxIjk, + bool ignoreDisabledVoxels) { + return ActiveVoxelsInBoundsMask(batchHdl, boundsMinIjk, boundsMaxIjk, + ignoreDisabledVoxels); } } // namespace ops diff --git a/fvdb/src/detail/ops/BuildDeviceGrid.cu b/fvdb/src/detail/ops/BuildDeviceGrid.cu index 790875d8ee..11288be6ac 100644 --- a/fvdb/src/detail/ops/BuildDeviceGrid.cu +++ b/fvdb/src/detail/ops/BuildDeviceGrid.cu @@ -1,37 +1,32 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#include "detail/utils/Utils.h" -#include "detail/utils/cuda/Utils.cuh" +#include +#include -#include "detail/GridBatchImpl.h" -#include "detail/build/Build.h" +#include +#include + +#include -#include #include #include +#include #include -#include - - namespace fvdb { namespace detail { namespace ops { - template typename TensorAccessorT> -__hostdev__ void populateGridMetadataKernel( - uint32_t numGrids, - const nanovdb::NanoGrid* grids, - const nanovdb::Vec3d* voxelSizes, - const nanovdb::Vec3d* voxelOrigins, - TensorAccessorT gridOffsets, - GridBatchImpl::GridMetadata* perGridMetadata, - GridBatchImpl::GridBatchMetadata* batchMetadata) { - - batchMetadata->mMaxVoxels = 0; +__hostdev__ void +populateGridMetadataKernel(uint32_t numGrids, const nanovdb::NanoGrid *grids, + const nanovdb::Vec3d *voxelSizes, const nanovdb::Vec3d *voxelOrigins, + TensorAccessorT gridOffsets, + GridBatchImpl::GridMetadata *perGridMetadata, + GridBatchImpl::GridBatchMetadata *batchMetadata) { + batchMetadata->mMaxVoxels = 0; batchMetadata->mMaxLeafCount = 0; batchMetadata->mIsMutable = nanovdb::util::is_same::value; @@ -39,90 +34,91 @@ __hostdev__ void populateGridMetadataKernel( nanovdb::Coord bbMin = nanovdb::Coord::max(); nanovdb::Coord bbMax = nanovdb::Coord::min(); - nanovdb::NanoGrid* currentGrid = (nanovdb::NanoGrid*) &grids[0]; - uint32_t i = 0; - uint64_t byteCount = 0; + nanovdb::NanoGrid *currentGrid = (nanovdb::NanoGrid *)&grids[0]; + uint32_t i = 0; + uint64_t byteCount = 0; perGridMetadata[i].mCumVoxels = 0; - perGridMetadata[i].mCumBytes = 0; + perGridMetadata[i].mCumBytes = 0; perGridMetadata[i].mCumLeaves = 0; gridOffsets[i] = 0; while (i < numGrids - 1) { - byteCount = currentGrid->gridSize(); - const uint32_t leafCount = currentGrid->tree().nodeCount(0); + byteCount = currentGrid->gridSize(); + const uint32_t leafCount = currentGrid->tree().nodeCount(0); const uint64_t voxelCount = currentGrid->tree().activeVoxelCount(); - GridBatchImpl::GridMetadata& metaCur = perGridMetadata[i]; - GridBatchImpl::GridMetadata& metaNext = perGridMetadata[i + 1]; + GridBatchImpl::GridMetadata &metaCur = perGridMetadata[i]; + GridBatchImpl::GridMetadata &metaNext = perGridMetadata[i + 1]; metaCur.setTransform(voxelSizes[i], voxelOrigins[i]); metaCur.mNumVoxels = voxelCount; - metaCur.mNumBytes = byteCount; + metaCur.mNumBytes = byteCount; metaCur.mNumLeaves = leafCount; - metaCur.mBBox = currentGrid->tree().bbox(); + metaCur.mBBox = currentGrid->tree().bbox(); 
metaNext.mCumVoxels = metaCur.mCumVoxels + voxelCount; - metaNext.mCumBytes = metaCur.mCumBytes + byteCount; + metaNext.mCumBytes = metaCur.mCumBytes + byteCount; metaNext.mCumLeaves = metaCur.mCumLeaves + leafCount; - gridOffsets[i+1] = metaCur.mCumVoxels + metaCur.mNumVoxels; + gridOffsets[i + 1] = metaCur.mCumVoxels + metaCur.mNumVoxels; // number of voxels exceeds maximum indexable value assert(voxelCount <= std::numeric_limits::max()); - batchMetadata->mMaxVoxels = max(batchMetadata->mMaxVoxels, static_cast(voxelCount)); + batchMetadata->mMaxVoxels = + max(batchMetadata->mMaxVoxels, static_cast(voxelCount)); batchMetadata->mMaxLeafCount = max(batchMetadata->mMaxLeafCount, leafCount); - bbMin = bbMin.minComponent(currentGrid->tree().bbox().min()); - bbMax = bbMax.maxComponent(currentGrid->tree().bbox().max()); - currentGrid = (nanovdb::NanoGrid*) (((uint8_t*) currentGrid) + byteCount); + bbMin = bbMin.minComponent(currentGrid->tree().bbox().min()); + bbMax = bbMax.maxComponent(currentGrid->tree().bbox().max()); + currentGrid = (nanovdb::NanoGrid *)(((uint8_t *)currentGrid) + byteCount); i += 1; } perGridMetadata[i].setTransform(voxelSizes[i], voxelOrigins[i]); perGridMetadata[i].mNumVoxels = currentGrid->tree().activeVoxelCount(); - perGridMetadata[i].mNumBytes = currentGrid->gridSize(); + perGridMetadata[i].mNumBytes = currentGrid->gridSize(); perGridMetadata[i].mNumLeaves = currentGrid->tree().nodeCount(0); - perGridMetadata[i].mBBox = currentGrid->tree().bbox(); + perGridMetadata[i].mBBox = currentGrid->tree().bbox(); - gridOffsets[i+1] = perGridMetadata[i].mCumVoxels + perGridMetadata[i].mNumVoxels; + gridOffsets[i + 1] = perGridMetadata[i].mCumVoxels + perGridMetadata[i].mNumVoxels; - batchMetadata->mMaxVoxels = max(batchMetadata->mMaxVoxels, perGridMetadata[i].mNumVoxels); + batchMetadata->mMaxVoxels = max(batchMetadata->mMaxVoxels, perGridMetadata[i].mNumVoxels); batchMetadata->mMaxLeafCount = max(batchMetadata->mMaxLeafCount, perGridMetadata[i].mNumLeaves); // number of voxels exceeds maximum indexable value - assert(perGridMetadata[i].mCumVoxels + perGridMetadata[i].mNumVoxels <= std::numeric_limits::max()); + assert(perGridMetadata[i].mCumVoxels + perGridMetadata[i].mNumVoxels <= + std::numeric_limits::max()); batchMetadata->mTotalVoxels = perGridMetadata[i].mCumVoxels + perGridMetadata[i].mNumVoxels; // number of grid leaf nodes exceeds maximum indexable value - assert(perGridMetadata[i].mCumLeaves + perGridMetadata[i].mNumLeaves <= std::numeric_limits::max()); + assert(perGridMetadata[i].mCumLeaves + perGridMetadata[i].mNumLeaves <= + std::numeric_limits::max()); batchMetadata->mTotalLeaves = perGridMetadata[i].mCumLeaves + perGridMetadata[i].mNumLeaves; - bbMin = bbMin.minComponent(currentGrid->tree().bbox().min()); - bbMax = bbMax.maxComponent(currentGrid->tree().bbox().max()); + bbMin = bbMin.minComponent(currentGrid->tree().bbox().min()); + bbMax = bbMax.maxComponent(currentGrid->tree().bbox().max()); batchMetadata->mTotalBBox = nanovdb::CoordBBox(bbMin, bbMax); batchMetadata->mIsMutable = nanovdb::util::is_same::value; } - template typename TensorAccessorT> -__global__ void populateGridMetadataCUDA( - uint32_t numGrids, - const nanovdb::NanoGrid* grids, - const nanovdb::Vec3d* voxelSizes, - const nanovdb::Vec3d* voxelOrigins, - TensorAccessorT outBatchOffsets, - GridBatchImpl::GridMetadata* perGridMetadata, - GridBatchImpl::GridBatchMetadata* batchMetadata) { - - populateGridMetadataKernel(numGrids, grids, voxelSizes, voxelOrigins, outBatchOffsets, perGridMetadata, 
batchMetadata); +__global__ void +populateGridMetadataCUDA(uint32_t numGrids, const nanovdb::NanoGrid *grids, + const nanovdb::Vec3d *voxelSizes, const nanovdb::Vec3d *voxelOrigins, + TensorAccessorT outBatchOffsets, + GridBatchImpl::GridMetadata *perGridMetadata, + GridBatchImpl::GridBatchMetadata *batchMetadata) { + populateGridMetadataKernel( + numGrids, grids, voxelSizes, voxelOrigins, outBatchOffsets, perGridMetadata, batchMetadata); } - -__global__ void ijkForDense(nanovdb::Coord origin, nanovdb::Coord size, TorchRAcc32 outIJKAccessor) { - const int32_t w = size[0], h = size[1], d = size[2]; - const uint64_t tid = (static_cast(blockIdx.x) * blockDim.x) + threadIdx.x; // = x * (h * d) + y * d + z) +__global__ void +ijkForDense(nanovdb::Coord origin, nanovdb::Coord size, TorchRAcc32 outIJKAccessor) { + const int32_t w = size[0], h = size[1], d = size[2]; + const uint64_t tid = (static_cast(blockIdx.x) * blockDim.x) + + threadIdx.x; // = x * (h * d) + y * d + z) if (tid >= outIJKAccessor.size(0)) { return; @@ -137,34 +133,35 @@ __global__ void ijkForDense(nanovdb::Coord origin, nanovdb::Coord size, TorchRAc outIJKAccessor[tid][2] = zi + origin[2]; } - struct NanoVDBGridBuilderTorchAllocator { - std::set mAllocatedData; + std::set mAllocatedData; - cudaError_t DeviceAllocate(void** ptr, size_t size, cudaStream_t stream) { + cudaError_t + DeviceAllocate(void **ptr, size_t size, cudaStream_t stream) { *ptr = c10::cuda::CUDACachingAllocator::raw_alloc_with_stream(size, stream); mAllocatedData.insert(*ptr); - return (cudaError_t) CUDA_SUCCESS; + return (cudaError_t)CUDA_SUCCESS; } - cudaError_t DeviceFree(void* ptr) { + cudaError_t + DeviceFree(void *ptr) { c10::cuda::CUDACachingAllocator::raw_delete(ptr); mAllocatedData.erase(ptr); - return (cudaError_t) CUDA_SUCCESS; + return (cudaError_t)CUDA_SUCCESS; } - void FreeAllCached() { - for (void* ptr : mAllocatedData) { + void + FreeAllCached() { + for (void *ptr: mAllocatedData) { c10::cuda::CUDACachingAllocator::raw_delete(ptr); } mAllocatedData.clear(); } }; - template <> -nanovdb::GridHandle dispatchCreateNanoGridFromIJK( - const JaggedTensor& ijk, bool isMutable) { +nanovdb::GridHandle +dispatchCreateNanoGridFromIJK(const JaggedTensor &ijk, bool isMutable) { TORCH_CHECK(ijk.is_contiguous(), "ijk must be contiguous"); TORCH_CHECK(ijk.device().is_cuda(), "device must be cuda"); TORCH_CHECK(ijk.device().has_index(), "device must have index"); @@ -175,15 +172,17 @@ nanovdb::GridHandle dispatchCreateNanoGridFromIJK ret = FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { - // This guide buffer is a hack to pass in a device with an index to the cudaCreateNanoGrid function. We can't pass in a device directly - // but we can pass in a buffer which gets passed to TorchDeviceBuffer::create. The guide buffer holds the device and - // effectively passes it to the created buffer. + // This guide buffer is a hack to pass in a device with an index to the cudaCreateNanoGrid + // function. We can't pass in a device directly but we can pass in a buffer which gets + // passed to TorchDeviceBuffer::create. The guide buffer holds the device and effectively + // passes it to the created buffer. TorchDeviceBuffer guide(0, nullptr, false, ijk.device().index()); - // FIXME: This is slow because we have to copy this data to the host and then build the grids. Ideally we want to do this in a single invocation. + // FIXME: This is slow because we have to copy this data to the host and then build the + // grids. Ideally we want to do this in a single invocation. 
torch::Tensor ijkBOffsetTensor = ijk.joffsets().cpu(); - auto ijkBOffset = ijkBOffsetTensor.accessor(); - torch::Tensor ijkData = ijk.jdata(); + auto ijkBOffset = ijkBOffsetTensor.accessor(); + torch::Tensor ijkData = ijk.jdata(); TORCH_CHECK(ijkData.is_contiguous(), "ijk must be contiguous"); TORCH_CHECK(ijkData.dim() == 2, "ijk must have shape (N, 3)"); TORCH_CHECK(ijkData.size(1) == 3, "ijk must have shape (N, 3)"); @@ -192,13 +191,16 @@ nanovdb::GridHandle dispatchCreateNanoGridFromIJK> handles; for (int i = 0; i < (ijkBOffset.size(0) - 1); i += 1) { const int64_t startIdx = ijkBOffset[i]; - const int64_t nVoxels = ijkBOffset[i+1] - startIdx; + const int64_t nVoxels = ijkBOffset[i + 1] - startIdx; // torch::Tensor ijkDataSlice = ijkData.narrow(0, startIdx, nVoxels); - const int32_t* dataPtr = ijkData.data_ptr() + 3 * startIdx; - - handles.push_back(nVoxels == 0 ? build::buildEmptyGrid(guide.device(), isMutable) : - nanovdb::tools::cuda::voxelsToGrid( - (nanovdb::Coord*) dataPtr, nVoxels, 1.0, guide)); + const int32_t *dataPtr = ijkData.data_ptr() + 3 * startIdx; + + handles.push_back( + nVoxels == 0 ? build::buildEmptyGrid(guide.device(), isMutable) + : nanovdb::tools::cuda::voxelsToGrid( + (nanovdb::Coord *)dataPtr, nVoxels, 1.0, guide)); C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -206,7 +208,8 @@ nanovdb::GridHandle dispatchCreateNanoGridFromIJK dispatchCreateNanoGridFromIJK -nanovdb::GridHandle dispatchCreateNanoGridFromDense(uint32_t batchSize, - nanovdb::Coord origin, - nanovdb::Coord size, - bool isMutable, - torch::Device device, - const torch::optional& maybeMask) { +nanovdb::GridHandle +dispatchCreateNanoGridFromDense(uint32_t batchSize, nanovdb::Coord origin, + nanovdb::Coord size, bool isMutable, + torch::Device device, + const torch::optional &maybeMask) { TORCH_CHECK(device.is_cuda(), "device must be cuda"); TORCH_CHECK(device.has_index(), "device must have index"); @@ -230,29 +231,28 @@ nanovdb::GridHandle dispatchCreateNanoGridFromDense(size[0]) * size[1] * size[2]; constexpr int NUM_THREADS = 1024; - const int64_t NUM_BLOCKS = GET_BLOCKS(gridVolume, NUM_THREADS); - + const int64_t NUM_BLOCKS = GET_BLOCKS(gridVolume, NUM_THREADS); - const torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kInt32).device(device); - torch::Tensor ijkData = torch::empty({gridVolume, 3}, opts); + const torch::TensorOptions opts = torch::TensorOptions().dtype(torch::kInt32).device(device); + torch::Tensor ijkData = torch::empty({ gridVolume, 3 }, opts); if (NUM_BLOCKS > 0) { ijkForDense<<>>( - origin, size, - ijkData.packed_accessor32()); + origin, size, ijkData.packed_accessor32()); C10_CUDA_KERNEL_LAUNCH_CHECK(); } if (maybeMask.has_value()) { - torch::Tensor mask = maybeMask.value().view({-1}); + torch::Tensor mask = maybeMask.value().view({ -1 }); TORCH_CHECK(mask.device() == device, "mask must be on same device as ijkData"); - ijkData = ijkData.index({mask}); + ijkData = ijkData.index({ mask }); } nanovdb::GridHandle ret = FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { - // This guide buffer is a hack to pass in a device with an index to the cudaCreateNanoGrid function. We can't pass in a device directly - // but we can pass in a buffer which gets passed to TorchDeviceBuffer::create. The guide buffer holds the device and - // effectively passes it to the created buffer. + // This guide buffer is a hack to pass in a device with an index to the cudaCreateNanoGrid + // function. 
We can't pass in a device directly but we can pass in a buffer which gets + // passed to TorchDeviceBuffer::create. The guide buffer holds the device and effectively + // passes it to the created buffer. TorchDeviceBuffer guide(0, nullptr, false, device.index()); TORCH_CHECK(ijkData.is_contiguous(), "ijkData must be contiguous"); @@ -261,9 +261,12 @@ nanovdb::GridHandle dispatchCreateNanoGridFromDense> handles; for (int i = 0; i < batchSize; i += 1) { const int64_t nVoxels = ijkData.size(0); - handles.push_back(nVoxels == 0 ? build::buildEmptyGrid(guide.device(), isMutable) : - nanovdb::tools::cuda::voxelsToGrid( - (nanovdb::Coord*) ijkData.data_ptr(), nVoxels, 1.0, guide)); + handles.push_back( + nVoxels == 0 ? build::buildEmptyGrid(guide.device(), isMutable) + : nanovdb::tools::cuda::voxelsToGrid( + (nanovdb::Coord *)ijkData.data_ptr(), nVoxels, 1.0, guide)); C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -271,7 +274,8 @@ nanovdb::GridHandle dispatchCreateNanoGridFromDense dispatchCreateNanoGridFromDense -void dispatchPopulateGridMetadata(const nanovdb::GridHandle& gridHdl, - const std::vector& voxelSizes, - const std::vector& voxelOrigins, - const bool isMutable, - torch::Tensor& outBatchOffsets, - GridBatchImpl::GridMetadata* outPerGridMetadataHost, - GridBatchImpl::GridMetadata* outPerGridMetadataDevice, - GridBatchImpl::GridBatchMetadata* outBatchMetadataHost, - GridBatchImpl::GridBatchMetadata* outBatchMetadataDevice) { +void +dispatchPopulateGridMetadata( + const nanovdb::GridHandle &gridHdl, + const std::vector &voxelSizes, const std::vector &voxelOrigins, + const bool isMutable, torch::Tensor &outBatchOffsets, + GridBatchImpl::GridMetadata *outPerGridMetadataHost, + GridBatchImpl::GridMetadata *outPerGridMetadataDevice, + GridBatchImpl::GridBatchMetadata *outBatchMetadataHost, + GridBatchImpl::GridBatchMetadata *outBatchMetadataDevice) { c10::cuda::CUDAGuard deviceGuard(gridHdl.buffer().device()); // Copy sizes and origins to device buffers - RAIIRawDeviceBuffer deviceVoxSizes(voxelSizes.size(), gridHdl.buffer().device()); - deviceVoxSizes.setData((nanovdb::Vec3d*) voxelSizes.data(), true /* blocking */); - const nanovdb::Vec3d* deviceVoxSizesPtr = deviceVoxSizes.devicePtr; + RAIIRawDeviceBuffer deviceVoxSizes(voxelSizes.size(), + gridHdl.buffer().device()); + deviceVoxSizes.setData((nanovdb::Vec3d *)voxelSizes.data(), true /* blocking */); + const nanovdb::Vec3d *deviceVoxSizesPtr = deviceVoxSizes.devicePtr; - RAIIRawDeviceBuffer deviceVoxOrigins(voxelOrigins.size(), gridHdl.buffer().device()); - deviceVoxOrigins.setData((nanovdb::Vec3d*) voxelOrigins.data(), true /* blocking */); - const nanovdb::Vec3d* deviceVoxOriginsPtr = deviceVoxOrigins.devicePtr; + RAIIRawDeviceBuffer deviceVoxOrigins(voxelOrigins.size(), + gridHdl.buffer().device()); + deviceVoxOrigins.setData((nanovdb::Vec3d *)voxelOrigins.data(), true /* blocking */); + const nanovdb::Vec3d *deviceVoxOriginsPtr = deviceVoxOrigins.devicePtr; - outBatchOffsets = torch::empty({(fvdb::JOffsetsType) (voxelOrigins.size() + 1)}, torch::TensorOptions().dtype(fvdb::JOffsetsScalarType).device(gridHdl.buffer().device())); + outBatchOffsets = torch::empty( + { (fvdb::JOffsetsType)(voxelOrigins.size() + 1) }, + torch::TensorOptions().dtype(fvdb::JOffsetsScalarType).device(gridHdl.buffer().device())); // Read metadata into device buffers FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { TORCH_CHECK(gridHdl.deviceData() != nullptr, "GridHandle is empty"); - const nanovdb::NanoGrid* grids = (nanovdb::NanoGrid*) gridHdl.deviceData(); + const 
nanovdb::NanoGrid *grids = + (nanovdb::NanoGrid *)gridHdl.deviceData(); populateGridMetadataCUDA<<<1, 1>>>( - gridHdl.gridCount(), grids, - (const nanovdb::Vec3d*) deviceVoxSizesPtr, - (const nanovdb::Vec3d*) deviceVoxOriginsPtr, + gridHdl.gridCount(), grids, (const nanovdb::Vec3d *)deviceVoxSizesPtr, + (const nanovdb::Vec3d *)deviceVoxOriginsPtr, outBatchOffsets.packed_accessor32(), - outPerGridMetadataDevice, - outBatchMetadataDevice); + outPerGridMetadataDevice, outBatchMetadataDevice); }); C10_CUDA_KERNEL_LAUNCH_CHECK(); const size_t metaDataByteSize = sizeof(GridBatchImpl::GridMetadata) * gridHdl.gridCount(); - cudaMemcpy(outPerGridMetadataHost, outPerGridMetadataDevice, metaDataByteSize, cudaMemcpyDeviceToHost); - cudaMemcpy(outBatchMetadataHost, outBatchMetadataDevice, sizeof(GridBatchImpl::GridBatchMetadata), cudaMemcpyDeviceToHost); + cudaMemcpy(outPerGridMetadataHost, outPerGridMetadataDevice, metaDataByteSize, + cudaMemcpyDeviceToHost); + cudaMemcpy(outBatchMetadataHost, outBatchMetadataDevice, + sizeof(GridBatchImpl::GridBatchMetadata), cudaMemcpyDeviceToHost); } template <> -void dispatchPopulateGridMetadata(const nanovdb::GridHandle& gridHdl, - const std::vector& voxelSizes, - const std::vector& voxelOrigins, - const bool isMutable, - torch::Tensor& outBatchOffsets, - GridBatchImpl::GridMetadata* outPerGridMetadataHost, - GridBatchImpl::GridMetadata* outPerGridMetadataDevice, - GridBatchImpl::GridBatchMetadata* outBatchMetadataHost, - GridBatchImpl::GridBatchMetadata* outBatchMetadataDevice) { - - outBatchOffsets = torch::empty({(fvdb::JOffsetsType) (voxelOrigins.size() + 1)}, torch::TensorOptions().dtype(fvdb::JOffsetsScalarType).device(gridHdl.buffer().device())); +void +dispatchPopulateGridMetadata( + const nanovdb::GridHandle &gridHdl, + const std::vector &voxelSizes, const std::vector &voxelOrigins, + const bool isMutable, torch::Tensor &outBatchOffsets, + GridBatchImpl::GridMetadata *outPerGridMetadataHost, + GridBatchImpl::GridMetadata *outPerGridMetadataDevice, + GridBatchImpl::GridBatchMetadata *outBatchMetadataHost, + GridBatchImpl::GridBatchMetadata *outBatchMetadataDevice) { + outBatchOffsets = torch::empty( + { (fvdb::JOffsetsType)(voxelOrigins.size() + 1) }, + torch::TensorOptions().dtype(fvdb::JOffsetsScalarType).device(gridHdl.buffer().device())); FVDB_DISPATCH_GRID_TYPES_MUTABLE(isMutable, [&]() { TORCH_CHECK(gridHdl.data() != nullptr, "GridHandle is empty"); - const nanovdb::NanoGrid* grids = (nanovdb::NanoGrid*) gridHdl.data(); + const nanovdb::NanoGrid *grids = (nanovdb::NanoGrid *)gridHdl.data(); populateGridMetadataKernel( - gridHdl.gridCount(), grids, voxelSizes.data(), voxelOrigins.data(), outBatchOffsets.accessor(), - outPerGridMetadataHost, outBatchMetadataHost); + gridHdl.gridCount(), grids, voxelSizes.data(), voxelOrigins.data(), + outBatchOffsets.accessor(), outPerGridMetadataHost, + outBatchMetadataHost); }); } - } // namespace ops } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/ops/CoordsInGrid.cu b/fvdb/src/detail/ops/CoordsInGrid.cu index 1816cb9b1c..62e1cae463 100644 --- a/fvdb/src/detail/ops/CoordsInGrid.cu +++ b/fvdb/src/detail/ops/CoordsInGrid.cu @@ -1,57 +1,63 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#include - -#include "detail/utils/cuda/Utils.cuh" +#include +#include namespace fvdb { namespace detail { namespace ops { -template typename JaggedAccessor, template typename TensorAccessor> -__hostdev__ inline void coordsInGridCallback(int32_t bidx, int32_t eidx, - 
JaggedAccessor ijk, - TensorAccessor outMask, - BatchGridAccessor batchAccessor, - bool ignoreMasked) { - const auto* gpuGrid = batchAccessor.grid(bidx); - auto primalAcc = gpuGrid->getAccessor(); - - const auto& ijkCoord = ijk.data()[eidx]; +template typename JaggedAccessor, + template typename TensorAccessor> +__hostdev__ inline void +coordsInGridCallback(int32_t bidx, int32_t eidx, JaggedAccessor ijk, + TensorAccessor outMask, BatchGridAccessor batchAccessor, + bool ignoreMasked) { + const auto *gpuGrid = batchAccessor.grid(bidx); + auto primalAcc = gpuGrid->getAccessor(); + + const auto &ijkCoord = ijk.data()[eidx]; const nanovdb::Coord vox(ijkCoord[0], ijkCoord[1], ijkCoord[2]); - const bool isActive = ignoreMasked ? primalAcc.isActive(vox) : primalAcc.template get>(vox); - outMask[eidx] = isActive; + const bool isActive = ignoreMasked ? primalAcc.isActive(vox) + : primalAcc.template get>(vox); + outMask[eidx] = isActive; } - template -JaggedTensor CoordsInGrid(const GridBatchImpl& batchHdl, const JaggedTensor& ijk, bool ignoreMasked) { - +JaggedTensor +CoordsInGrid(const GridBatchImpl &batchHdl, const JaggedTensor &ijk, bool ignoreMasked) { batchHdl.checkNonEmptyGrid(); batchHdl.checkDevice(ijk); TORCH_CHECK_TYPE(!ijk.is_floating_point(), "ijk must have an integeral type"); - TORCH_CHECK(ijk.rdim() == 2, std::string("Expected ijk to have 2 dimensions (shape (n, 3)) but got ") + std::to_string(ijk.rdim()) + " dimensions"); + TORCH_CHECK(ijk.rdim() == 2, + std::string("Expected ijk to have 2 dimensions (shape (n, 3)) but got ") + + std::to_string(ijk.rdim()) + " dimensions"); TORCH_CHECK(ijk.rsize(0) > 0, "Empty tensor (ijk)"); - TORCH_CHECK(ijk.rsize(1) == 3, "Expected 3 dimensional ijk but got ijk.shape[1] = " + std::to_string(ijk.rsize(1))); + TORCH_CHECK(ijk.rsize(1) == 3, "Expected 3 dimensional ijk but got ijk.shape[1] = " + + std::to_string(ijk.rsize(1))); - auto opts = torch::TensorOptions().dtype(torch::kBool).device(ijk.device()); - torch::Tensor outMask = torch::empty({ijk.rsize(0)}, opts); + auto opts = torch::TensorOptions().dtype(torch::kBool).device(ijk.device()); + torch::Tensor outMask = torch::empty({ ijk.rsize(0) }, opts); FVDB_DISPATCH_GRID_TYPES(batchHdl, [&]() { AT_DISPATCH_INTEGRAL_TYPES(ijk.scalar_type(), "CoordsInGrid", [&]() { - - auto batchAcc = gridBatchAccessor(batchHdl); + auto batchAcc = gridBatchAccessor(batchHdl); auto outMaskAccessor = tensorAccessor(outMask); if constexpr (DeviceTag == torch::kCUDA) { - auto cb = [=] __device__ (int32_t bidx, int32_t eidx, int32_t cidx, JaggedRAcc32 ijkAcc) { - coordsInGridCallback(bidx, eidx, ijkAcc, outMaskAccessor, batchAcc, ignoreMasked); + auto cb = [=] __device__(int32_t bidx, int32_t eidx, int32_t cidx, + JaggedRAcc32 ijkAcc) { + coordsInGridCallback( + bidx, eidx, ijkAcc, outMaskAccessor, batchAcc, ignoreMasked); }; forEachJaggedElementChannelCUDA(1024, 1, ijk, cb); } else { - auto cb = [=] (int32_t bidx, int32_t eidx, int32_t cidx, JaggedAcc ijkAcc) { - coordsInGridCallback(bidx, eidx, ijkAcc, outMaskAccessor, batchAcc, ignoreMasked); + auto cb = [=](int32_t bidx, int32_t eidx, int32_t cidx, + JaggedAcc ijkAcc) { + coordsInGridCallback( + bidx, eidx, ijkAcc, outMaskAccessor, batchAcc, ignoreMasked); }; forEachJaggedElementChannelCPU(1, ijk, cb); } @@ -61,18 +67,20 @@ JaggedTensor CoordsInGrid(const GridBatchImpl& batchHdl, const JaggedTensor& ijk return ijk.jagged_like(outMask); } - template <> -JaggedTensor dispatchCoordsInGrid(const GridBatchImpl& batchHdl, const JaggedTensor& coords, bool ignoreMasked) { 
+JaggedTensor +dispatchCoordsInGrid(const GridBatchImpl &batchHdl, const JaggedTensor &coords, + bool ignoreMasked) { return CoordsInGrid(batchHdl, coords, ignoreMasked); } template <> -JaggedTensor dispatchCoordsInGrid(const GridBatchImpl& batchHdl, const JaggedTensor& coords, bool ignoreMasked) { +JaggedTensor +dispatchCoordsInGrid(const GridBatchImpl &batchHdl, const JaggedTensor &coords, + bool ignoreMasked) { return CoordsInGrid(batchHdl, coords, ignoreMasked); } - } // namespace ops } // namespace detail } // namespace fvdb diff --git a/fvdb/src/detail/ops/CountEnabledVoxels.cu b/fvdb/src/detail/ops/CountEnabledVoxels.cu index 3a8fc22e38..3706c6eba7 100644 --- a/fvdb/src/detail/ops/CountEnabledVoxels.cu +++ b/fvdb/src/detail/ops/CountEnabledVoxels.cu @@ -1,11 +1,10 @@ // Copyright Contributors to the OpenVDB Project // SPDX-License-Identifier: MPL-2.0 // -#include - -#include "detail/utils/cuda/Utils.cuh" -#include "detail/utils/nanovdb/CustomAccessors.h" +#include +#include +#include namespace fvdb { namespace detail { @@ -16,12 +15,14 @@ namespace ops { /// @param li the index of the leaf to process /// @param outUnmaskedPerLeaf the output tensor storing the number of unmasked voxels in each leaf template